In [None]:
# Basic Libraries
import pandas as pd
import numpy as np

# NLTK Libraries
import nltk
import re
import string
from wordcloud import WordCloud, STOPWORDS
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning libraries
import sklearn 
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn import svm, datasets
from sklearn import preprocessing 


#Metrics libraries
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

#Visualization libraries
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot
%matplotlib inline

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Other miscellaneous libraries
from scipy import interp
from itertools import cycle
import cufflinks as cf
from collections import defaultdict
from collections import Counter
from imblearn.over_sampling import SMOTE

In [None]:
# Load the raw review data from the Kaggle input mount.
raw_reviews = pd.read_csv("/kaggle/input/amazon-music-reviews/Musical_instruments_reviews.csv")

print("The shape of the data is (row, column): " + str(raw_reviews.shape))
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
raw_reviews.info()

In [None]:
raw_reviews.head()

# Handling NaN values

In [None]:
process_reviews = raw_reviews.copy()

process_reviews.isnull().sum()

In [None]:
process_reviews['reviewText'] = process_reviews['reviewText'].fillna("Missing")

In [None]:
# Merge the review body and its summary into a single text column.
# - The " " separator keeps the last word of the body from fusing with the
#   first word of the summary.
# - Only reviewText had its missing values filled; a NaN summary would make the
#   whole concatenation NaN, so treat a missing summary as empty text.
process_reviews["reviews"] = (
    process_reviews["reviewText"] + " " + process_reviews["summary"].fillna("")
)
process_reviews = process_reviews.drop(["reviewText", "summary"], axis=1)
process_reviews.head()

In [None]:
process_reviews['overall'].value_counts()

In [None]:
def f(row):
    '''Map a row's 'overall' rating to a sentiment label.

    Returns "Neutral" for 3, "Negative" for 1 or 2, "Positive" for 4 or 5,
    and -1 for any other rating value.
    '''
    rating = row['overall']
    if rating == 3.0:
        return "Neutral"
    if rating == 1.0 or rating == 2.0:
        return "Negative"
    if rating == 4.0 or rating == 5.0:
        return "Positive"
    # Anything outside the 1-5 integer scale is flagged with a sentinel.
    return -1

In [None]:
process_reviews["sentiment"] = process_reviews.apply(f, axis=1)
process_reviews.head()

In [None]:
process_reviews["sentiment"].value_counts()

In [None]:
# Split reviewTime once on the comma: the left part becomes "date", the right
# part becomes "year" (presumably the text looks like "MM DD, YYYY" - confirm).
new = process_reviews["reviewTime"].str.split(",", n=1, expand=True)
process_reviews["date"] = new[0]
# Whatever follows the comma keeps its leading whitespace (", 2014" -> " 2014");
# strip it so year labels are clean when grouped and plotted.
process_reviews["year"] = new[1].str.strip()
process_reviews = process_reviews.drop(["reviewTime"], axis=1)
process_reviews.head()

In [None]:
# Split the "date" text on its first space: the left part is stored as "month",
# the right part as "day". Both stay strings at this point.
new1 = process_reviews["date"].str.split(" ", n=1, expand=True)
process_reviews["month"] = new1[0]
process_reviews["day"] = new1[1]

# The combined column is dropped once its two parts have been extracted.
process_reviews = process_reviews.drop(["date"], axis=1)
process_reviews.head()

In [None]:
process_reviews["reviews"][1]

In [None]:
# Parse the "helpful" text column into two numeric-looking string columns.
# Presumably each value looks like "[a, b]" - TODO confirm against the raw data.
# Split once on the comma: new1[0] holds the part up to it, new1[1] the rest.
new1 = process_reviews["helpful"].str.split(',', n=1, expand=True)
# Split the left part on '[' : column 0 is the text before it, column 1 the text after.
new2 = new1[0].str.split('[', n=1, expand=True)
# Split the right part on ']' : column 0 is the text before it, column 1 the text after.
new3 = new1[1].str.split(']', n=1, expand=True)

# Give both pieces a fresh 0..n-1 index so they line up row by row in the concat.
new2.reset_index(drop=True, inplace=True)
new3.reset_index(drop=True, inplace=True)

# Keep only the value columns: new2 keeps its column labelled 1 (text after '['),
# new3 keeps its column labelled 0 (text before ']').
new2 = new2.drop([0], axis=1)
new3 = new3.drop([1], axis=1)

# Resulting column labels are [1, 0]: helpful[1] is the first number of the pair
# and helpful[0] is the second. Later code relies on these exact labels.
helpful = pd.concat([new2, new3], axis=1)

def trim_all_columns(df):
    '''
    Trim whitespace from both ends of every string value in a dataframe.

    Non-string values are passed through unchanged. A new DataFrame is
    returned; the input frame is not modified.
    '''
    def trim_strings(x):
        return x.strip() if isinstance(x, str) else x

    # DataFrame.applymap was renamed to DataFrame.map and the old name is
    # deprecated. Prefer the new method when the installed pandas has it
    # (checked on the class so a column named "map" cannot shadow it) and fall
    # back to applymap on older versions.
    elementwise = df.map if hasattr(type(df), "map") else df.applymap
    return elementwise(trim_strings)

helpful = trim_all_columns(helpful)

# Column 1 holds the first number of each pair and column 0 the second (the
# labels come from the split/drop steps that built this frame).
helpful[0] = helpful[0].astype(str).astype(int)
helpful[1] = helpful[1].astype(str).astype(int)

# Element-wise Series division never raises ZeroDivisionError: 0/0 produces NaN
# and n/0 produces +/-inf. The former try/except branch was therefore
# unreachable; map both outcomes to 0 explicitly instead.
ratio = helpful[1] / helpful[0]
helpful['result'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0).round(2)

process_reviews['helpful_rate'] = helpful['result']

process_reviews = process_reviews.drop(['helpful'], axis=1)

In [None]:
process_reviews.head()

In [None]:
process_reviews["helpful_rate"].value_counts()

In [None]:
process_reviews = process_reviews.drop(['reviewerName', 'unixReviewTime'], axis=1)
clean_reviews = process_reviews.copy()

In [None]:
clean_reviews.head()

In [None]:
def review_cleaning(text):
    '''Normalise a raw review string for text analysis.

    Steps, in order: lowercase, remove text in square brackets, remove links,
    remove angle-bracket tags, remove punctuation, remove newlines, and remove
    words containing digits.

    Args:
        text: Any value; it is converted with str() first.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed tokens can
        leave runs of spaces behind.
    '''
    # Patterns are raw strings so that regex escapes (backslash-bracket, \S, \w,
    # \d) are not parsed as invalid Python string escapes, which newer Python
    # versions warn about.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
process_reviews['reviews'] = process_reviews['reviews'].apply(lambda x:review_cleaning(x))
process_reviews.head()

In [None]:
process_reviews.reviews[50]

In [None]:
# Hand-maintained stop-word list used by the stop-word filtering and stemming steps.
# 'not', 'no' and 'nor' do not appear in it, so those words survive filtering.
# NOTE(review): entries containing an apostrophe (e.g. "don't") cannot match text
# that has already been through review_cleaning, which strips punctuation -
# confirm whether they are still needed.
stop_words= ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each', 
             'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
             'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above', 
             'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't", 
             'very', 'should', 'any', 'y', 'isn', 'who',  'a', 'they', 'to', 'too', "should've", 'has', 'before',
             'into', 'yours', "it's", 'do', 'against', 'on',  'now', 'her', 've', 'd', 'by', 'am', 'from', 
             'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
             'his', 'himself', 'ourselves',  'was', 'through', 'out', 'below', 'own', 'myself', 'theirs', 
             'me', 'why', 'once',  'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
             'at', 'after', 'its', 'which', 'there','our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
             'over','again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all']

In [None]:
# Membership tests against a list scan the whole list for every word; build a
# set once so each lookup is constant time. The filtered text is unchanged.
stop_word_set = set(stop_words)
process_reviews['reviews'] = process_reviews['reviews'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_word_set])
)
process_reviews.reviews

In [None]:
pd.DataFrame(process_reviews.groupby('sentiment')['helpful_rate'].mean())

In [None]:
# Shared matplotlib styling: font size 18 and a 16x9 figure.
rcParams.update({'font.size': 18, 'figure.figsize': (16, 9)})

# Keep only the two plotted columns, for reviews whose helpful rate is not 0.
nonzero_rate = process_reviews['helpful_rate'] != 0.0
senti_help = process_reviews.loc[nonzero_rate, ['sentiment', 'helpful_rate']]

sentiment_axis = senti_help["sentiment"]
rate_axis = senti_help["helpful_rate"]
sns.violinplot(x=sentiment_axis, y=rate_axis)
plt.title('Sentiment vs Helpfulness')
plt.xlabel('Sentiment Categories')
plt.ylabel('Helpful Rate')
plt.show()

In [None]:
# Count reviews per (year, sentiment) pair, then unstack so each sentiment
# becomes its own column and is drawn as its own line.
process_reviews.groupby(['year', 'sentiment'])['sentiment'].count().unstack().plot(legend=True)
plt.title("Year and Sentiment count")
plt.xlabel("Year")
plt.ylabel("Sentiment Count")
plt.show()

In [None]:
# Number of reviews per "day" value; the day column is cast to int64 so it can
# be ordered numerically.
day = pd.DataFrame(process_reviews.groupby('day')['reviews'].count()).reset_index()
day['day'] = day['day'].astype('int64')
# sort_values returns a new frame; its result was previously discarded, which
# left the rows in their pre-cast group order. Keep the sorted frame.
day = day.sort_values(by=['day'])

sns.barplot(x='day', y='reviews', data=day)
plt.title("Day vs Reviews Count")
plt.xlabel("Day")
plt.ylabel("Reviews Count")
plt.show()

In [None]:
# Per-review text features derived from the 'reviews' column:
#   polarity   - TextBlob sentiment polarity of the text
#   review_len - number of characters in the text
#   word_count - number of whitespace-separated tokens in the text
process_reviews['polarity'] = process_reviews['reviews'].map(lambda text: TextBlob(text).sentiment.polarity)
process_reviews['review_len'] = process_reviews['reviews'].astype(str).apply(len)
process_reviews['word_count'] = process_reviews['reviews'].apply(lambda x: len(str(x).split()))

In [None]:
process_reviews.head()

In [None]:
# Configure cufflinks before the .iplot() calls that follow.
# NOTE(review): go_offline() is immediately followed by
# set_config_file(offline=False, ...), which looks contradictory - confirm
# which mode is intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [None]:
process_reviews['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution'
    )

In [None]:
process_reviews['overall'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution')

In [None]:
process_reviews['review_len'].iplot(
    kind='hist',
    bins=50,
    xTitle='review length',
    yTitle='count',
    linecolor='black',
    title="Review Text Length Distribution"
    )

In [None]:
process_reviews['word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='Review Text Word Count Distribution')

# Unigram Analysis

In [None]:
# One frame per sentiment label. dropna() with no arguments removes every row
# that has a missing value in ANY column, not only in 'reviews'.
review_pos = process_reviews[process_reviews['sentiment']=="Positive"].dropna()
review_neu = process_reviews[process_reviews['sentiment']=="Neutral"].dropna()
review_neg = process_reviews[process_reviews['sentiment']=="Negative"].dropna()

def generate_ngrams(text, n_gram=1):
    """Return the word n-grams of `text` as space-joined strings.

    The text is lowercased and split on single spaces; empty tokens and tokens
    found in STOPWORDS are dropped before the n-grams are formed.
    """
    words = []
    for piece in text.lower().split(" "):
        if piece == "" or piece in STOPWORDS:
            continue
        words.append(piece)
    # n_gram shifted views of the word list; zipping them yields each window.
    shifted = [words[start:] for start in range(n_gram)]
    return [" ".join(window) for window in zip(*shifted)]

def horizontal_bar_chart(df, color):
    """Build a horizontal plotly bar trace from a word-frequency frame.

    `df` must provide "word" and "wordcount" columns; `color` is used as the
    bar colour. Both columns are passed in reversed row order (presumably so
    the first row ends up at the top of the chart).
    """
    words_reversed = df["word"].values[::-1]
    counts_reversed = df["wordcount"].values[::-1]
    bar_style = dict(color=color)
    return go.Bar(
        y=words_reversed,
        x=counts_reversed,
        showlegend=False,
        orientation='h',
        marker=bar_style,
    )

# One top-25 unigram bar trace per sentiment group (positive, neutral, negative).
unigram_traces = []
for group_reviews, bar_color in (
    (review_pos["reviews"], 'green'),
    (review_neu["reviews"], 'grey'),
    (review_neg["reviews"], 'red'),
):
    freq_dict = defaultdict(int)
    for sent in group_reviews:
        for word in generate_ngrams(sent):
            freq_dict[word] += 1
    # Sort ascending by count, then reverse, so the most frequent words lead.
    fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
    fd_sorted.columns = ["word", "wordcount"]
    unigram_traces.append(horizontal_bar_chart(fd_sorted.head(25), bar_color))
trace0, trace1, trace2 = unigram_traces

# Stack the three charts vertically, one subplot row per sentiment.
unigram_titles = ["Frequent words of positive reviews", "Frequent words of neutral reviews","Frequent words of negative reviews"]
fig = tools.make_subplots(rows=3, cols=1, vertical_spacing=0.04, subplot_titles=unigram_titles)
for row_number, trace in enumerate(unigram_traces, start=1):
    fig.append_trace(trace, row_number, 1)
fig['layout'].update(height=1200, width= 900, paper_bgcolor='rgb(233, 233, 233)', title='Word Count Plots')
iplot(fig, filename='word-plots')

# Bigram Analysis

In [None]:
# One top-25 bigram bar trace per sentiment group (positive, neutral, negative).
bigram_traces = []
for group_reviews, bar_color in (
    (review_pos["reviews"], 'green'),
    (review_neu["reviews"], 'grey'),
    (review_neg["reviews"], 'brown'),
):
    freq_dict = defaultdict(int)
    for sent in group_reviews:
        for word in generate_ngrams(sent, 2):
            freq_dict[word] += 1
    # Sort ascending by count, then reverse, so the most frequent bigrams lead.
    fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x:x[1])[::-1])
    fd_sorted.columns = ["word", "wordcount"]
    bigram_traces.append(horizontal_bar_chart(fd_sorted.head(25), bar_color))
trace0, trace1, trace2 = bigram_traces

# Stack the three charts vertically, one subplot row per sentiment.
bigram_titles = ["Bigram plots of Positive reviews",
                 "Bigram plots of Neutral reviews",
                 "Bigram plots of Negative reviews"
                 ]
fig = tools.make_subplots(rows=3, cols=1, vertical_spacing=0.04,horizontal_spacing=0.25,
                          subplot_titles=bigram_titles)
for row_number, trace in enumerate(bigram_traces, start=1):
    fig.append_trace(trace, row_number, 1)

fig['layout'].update(height=1000, width=800, paper_bgcolor='rgb(233,233,233)', title="Bigram Plots")
iplot(fig, filename='word-plots')

# Trigram Analysis

In [None]:
# One top-25 trigram bar trace per sentiment group (positive, neutral, negative).
trigram_traces = []
for group_reviews, bar_color in (
    (review_pos["reviews"], 'green'),
    (review_neu["reviews"], 'grey'),
    (review_neg["reviews"], 'brown'),
):
    freq_dict = defaultdict(int)
    for sent in group_reviews:
        for word in generate_ngrams(sent, 3):
            freq_dict[word] += 1
    # Sort ascending by count, then reverse, so the most frequent trigrams lead.
    fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x:x[1])[::-1])
    fd_sorted.columns = ["word", "wordcount"]
    trigram_traces.append(horizontal_bar_chart(fd_sorted.head(25), bar_color))
trace0, trace1, trace2 = trigram_traces

# The subplot titles and the figure title were copied from the bigram cell and
# still said "Bigram"; these charts are built with n_gram=3.
fig = tools.make_subplots(rows=3, cols=1, vertical_spacing=0.04,horizontal_spacing=0.25,
                          subplot_titles=["Trigram plots of Positive reviews",
                                          "Trigram plots of Neutral reviews",
                                          "Trigram plots of Negative reviews"
                                          ])
for row_number, trace in enumerate(trigram_traces, start=1):
    fig.append_trace(trace, row_number, 1)

fig['layout'].update(height=1000, width=800, paper_bgcolor='rgb(233,233,233)', title="Trigram Plots")
iplot(fig, filename='word-plots')

## Wordcloud 

### Positive wordcloud

In [None]:
# Join every positive review into one string. str(Series) only yields pandas'
# truncated display text (index numbers, "...", and the trailing
# "Name: ..., dtype: ..." line), so the cloud was built from a handful of rows
# plus that metadata rather than from the full review text.
text = " ".join(review_pos["reviews"].astype(str))
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS
    ).generate(text)
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k'
    )
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

### Neutral Wordcloud

In [None]:
# Join every neutral review into one string. str(Series) only yields pandas'
# truncated display text (index numbers, "...", and the trailing
# "Name: ..., dtype: ..." line), so the cloud was built from a handful of rows
# plus that metadata rather than from the full review text.
text = " ".join(review_neu["reviews"].astype(str))
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(text)
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

### Negative wordcloud

In [None]:
# Join every negative review into one string. str(Series) only yields pandas'
# truncated display text (index numbers, "...", and the trailing
# "Name: ..., dtype: ..." line), so the cloud was built from a handful of rows
# plus that metadata rather than from the full review text.
text = " ".join(review_neg["reviews"].astype(str))
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS
    ).generate(text)
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k'
    )
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
process_reviews.head()

In [None]:
# Replace the sentiment strings with integer codes, overwriting the column in
# place (the code-to-label mapping is listed in the markdown right below).
label_encoder = preprocessing.LabelEncoder()

process_reviews['sentiment'] = label_encoder.fit_transform(process_reviews["sentiment"])

# Show which integer codes are present after encoding.
process_reviews['sentiment'].unique()

### 2 - Positive
### 1 - Neutral
### 0 - Negative

In [None]:
process_reviews['sentiment'].value_counts()


In [None]:
review_features = process_reviews.copy()
review_features = review_features[['reviews']].reset_index(drop=True)
review_features.head()

In [None]:
# Build a stemmed corpus: one cleaned, stop-word-free, Porter-stemmed string
# per row of review_features, in row order.
ps = PorterStemmer()
corpus = []
for raw_review in review_features['reviews']:
    # Anything that is not an ASCII letter becomes a space before tokenising.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)
    stems = [ps.stem(word) for word in letters_only.split() if word not in stop_words]
    corpus.append(' '.join(stems))

In [None]:
corpus[10]

In [None]:
# Learn the TF-IDF vocabulary (top 2000 unigrams + bigrams) from the STEMMED
# corpus. preprocess() stems new sentences before they are vectorised, so the
# vocabulary has to be learned from stemmed text as well; fitting on the
# unstemmed 'reviews' column left `corpus` unused and meant stemmed inference
# tokens could miss the vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features = 2000, ngram_range=(1,2))
vectorizer = tfidf_vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

In [None]:
X

In [None]:
y=process_reviews['sentiment']

In [None]:
# Class counts before oversampling.
print(f"Original dataset shape: {Counter(y)}")

# NOTE(review): SMOTE is fitted on the full X / y here. If a train/test split is
# taken from X_res / y_res afterwards, synthetic points interpolated from test
# rows can end up in the training set - confirm the split happens before
# resampling wherever these arrays are used for evaluation.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X,y)

# Class counts after oversampling.
print(f"Resampled dataset shape: {Counter(y_res)}")

In [None]:
# Split the ORIGINAL data first, then oversample only the training portion.
# Splitting an already SMOTE-resampled set lets synthetic points interpolated
# from test rows into training (and puts synthetic rows in the test set), which
# inflates every test metric. The held-out set now keeps the real class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title="Confusion matrix", cmap=plt.cm.Blues):
    """
    Print a one-line header and draw the confusion matrix as an annotated heat map.

    Args:
        cm: 2-D array of counts; rows are labelled "True Label" and columns
            "predicted label" on the plot.
        classes: Tick labels, in the same order as the rows/columns of `cm`.
        normalize: If True, each row is divided by its row sum before plotting
            and annotating. Enable with `normalize=True`.
        title: Plot title.
        cmap: Matplotlib colormap for the heat map.
    """
    # Normalise BEFORE drawing. Previously the image was drawn from the raw
    # counts and only the text annotations were normalised, so colours and
    # numbers described different matrices.
    if normalize:
        cm = cm.astype('float')/cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')
    else:
        print("Confusion matrix, without normalization")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so it stays readable.
    thresh = cm.max()/2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # Two decimals for fractions; raw counts are shown unchanged.
            label = format(cm[i, j], '.2f') if normalize else cm[i, j]
            plt.text(j, i, label, horizontalalignment="center", color="white" if cm[i, j]>thresh else "black")

    plt.tight_layout()
    plt.ylabel("True Label")
    plt.xlabel('predicted label')
    

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without random_state every run trains a different model and
# the accuracy shown below cannot be reproduced.
rf = RandomForestClassifier(random_state=0)
rf = rf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, rf.predict(X_test))


In [None]:
from sklearn.metrics import f1_score
f1_score(y_test, rf.predict(X_test), average='macro')

In [None]:
def preprocess(sentence):
    """Clean, stop-word filter and Porter-stem a single raw sentence.

    Steps: review_cleaning, drop stop words, replace every character that is
    not an ASCII letter with a space, drop stop words again, stem each
    remaining word, and join the stems with single spaces.
    """
    cleaned = review_cleaning(sentence)
    kept = [token for token in cleaned.split() if token not in stop_words]
    letters_only = re.sub('[^a-zA-Z]', ' ', ' '.join(kept))
    stems = []
    for token in letters_only.split():
        # Second pass: splitting on non-letters can expose new stop words.
        if token in stop_words:
            continue
        stems.append(ps.stem(token))
    return ' '.join(stems)

In [None]:
def vectorize(sentence):
    """Transform one sentence with the module-level fitted `vectorizer`.

    The sentence is wrapped in a one-element list before being passed to
    `vectorizer.transform`, whose result is returned as is.
    """
    single_document = [sentence]
    return vectorizer.transform(single_document)


In [None]:
sentence = "The guitar strings looked good but every string was broken. Worst Experience. Wouldn't Recommend to anyone. The guitar strings looked good but every string was broken. Worst Experience. Wouldn't Recommend to anyone."
print(preprocess(sentence))
X_check = vectorize(preprocess(sentence))

## Model Selection

In [None]:
# Candidate classifiers with library defaults (logistic regression is seeded).
logreg_cv = LogisticRegression(random_state=0)
dt_cv = DecisionTreeClassifier()
knn_cv = KNeighborsClassifier()
svc_cv = SVC()
nb_cv = BernoulliNB()

cv_models = [logreg_cv, dt_cv, knn_cv, svc_cv, nb_cv]
# Position in cv_models -> display name.
cv_dict = dict(enumerate(["Logistic Regression", "Decision Tree", "KNN", "SVC", "Naive Bayes"]))

# Mean accuracy over 10 cross-validation folds of X / y for each candidate.
for position, estimator in enumerate(cv_models):
    fold_scores = cross_val_score(estimator, X, y, cv=10, scoring='accuracy')
    print("{} Test Accuracy: {}".format(cv_dict[position], fold_scores.mean()))

In [None]:
# Grid over regularisation strength (50 log-spaced values of C) and penalty type.
param_grid = {"C":np.logspace(-4, 4, 50),
             'penalty': ['l1', 'l2']}
# The default lbfgs solver does not support the l1 penalty, so every l1
# candidate in the grid failed to fit. saga supports both l1 and l2 and handles
# multiclass targets.
clf = GridSearchCV(LogisticRegression(random_state=0, max_iter=100, solver='saga'), param_grid, cv=5, verbose=0, n_jobs=-1)
best_model = clf.fit(X_train, y_train)
print(best_model.best_estimator_)
print("The mean accuracy of the model is: ", best_model.score(X_test, y_test))

In [None]:
# Fit the final logistic regression with a hard-coded C and score it on the
# current X_test / y_test.
# NOTE(review): C=10000.0 is presumably the value picked by the grid search
# above - confirm, since it is not read from best_model.
logreg = LogisticRegression(C=10000.0, random_state=0)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Accuracy of logistic regression classifier on test set: {:.2f}".format(logreg.score(X_test, y_test)))

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes=['Negative', 'Neutral', 'Positive'])

In [None]:
print("Classification Report:\n", classification_report(y_test, y_pred
                                                       ))

In [None]:
# One-vs-rest indicator matrix for the three encoded sentiment classes.
# Binarise from the source column rather than from `y` itself: the previous
# self-referential form fed an already-binarised array back in when the cell
# was run a second time. The resulting value of `y` is the same on a first run.
y = label_binarize(process_reviews['sentiment'], classes=[0, 1, 2])
n_classes = y.shape[1]

# Note: this replaces X_train / X_test / y_train / y_test with a new 80/20
# split of the un-resampled X and the binarised y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# One linear SVM per class; decision_function scores feed the ROC curves.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=10))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)


In [None]:
# Per-class ROC curve and AUC, computed column by column.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: flatten all (label, score) pairs into a single ROC curve.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average: interpolate every class curve onto the union of all FPR
# points, then average the interpolated TPRs.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp is used directly: scipy.interp was only a deprecated alias of
    # it, so results are unchanged and the cell no longer depends on that name.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
        label = 'micro-average ROC curve (area = {0:0.2f})'
         ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4
        )
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

# One solid line per class, cycling through three colours.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=4,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', lw=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

In [None]:
sentence = "Worst Instrument"
print(len(sentence))

In [None]:
# Inputs shorter than 100 characters are repeated once before vectorising.
# Join the two copies with a space: plain `sentence * 2` glued the last word of
# the first copy to the first word of the second ("InstrumentWorst"), creating
# a token that is not a real word.
if len(sentence) < 100:
    sentence = " ".join([sentence] * 2)
    print(sentence)
# Both branches vectorised the same way, so do it once after the optional repeat.
X_check = vectorize(preprocess(sentence))

In [None]:
logreg.predict(X_check)

In [None]:
import pickle

# Persist the fitted vectorizer. The context manager closes the file even if
# pickling raises part-way through.
# NOTE(review): the file name says "(2,2)" but the vectorizer in this notebook
# is built with ngram_range=(1,2) - confirm before renaming, since consumers
# may depend on the existing name.
with open('vectoriser-ngram-(2,2).pickle','wb') as file:
    pickle.dump(vectorizer, file)

In [None]:


# Persist the fitted logistic regression. The context manager closes the file
# even if pickling raises part-way through.
with open('logreg.pickle','wb') as file:
    pickle.dump(logreg, file)