### Binary customer reviews classifier
This project classifies each customer review into one of two classes: reviews that talk about the documentation and reviews that don't.

In [160]:
import pandas as pd

# Read the labelled reviews from disk and summarise every column
df = pd.read_csv(filepath_or_buffer='data.csv')
df.describe()

Unnamed: 0,Comment,Label
count,792,792
unique,648,2
top,subscribe to my blog!,False
freq,110,587


In [161]:
# Preprocess the comment strings:
# 1. Replace every non-word character with a space, then drop single
#    characters left stranded between spaces (e.g. the "s" or "m" that
#    remains once an apostrophe has been removed).
# 2. Drop a single character at the very start of the string, collapse runs
#    of whitespace into one space and convert to lower case.
#
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn every step below into a silent
# no-op. Older pandas versions accept the keyword as well.
df['Comment'] = (
    df['Comment']
    .str.replace(r'\W', ' ', regex=True)
    .str.replace(r'\s+[a-zA-Z0-9]\s+', ' ', regex=True)
    .str.replace(r'^[a-zA-Z0-9]\s+', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
)

In [162]:
# 3. Remove rows whose comment text repeats an earlier row (the first occurrence is kept)
df.drop_duplicates(subset=["Comment"], keep="first", inplace=True)

In [163]:
# Lemmatize the words
import nltk

# Built once and reused for every row instead of being re-created on each call.
_TOKENIZER = nltk.tokenize.WhitespaceTokenizer()
_LEMMATIZER = nltk.stem.WordNetLemmatizer()


def lemmatize(sentence):
    """Return `sentence` with every whitespace-separated token lemmatized.

    The sentence is split on whitespace, each token is passed through NLTK's
    WordNetLemmatizer (with its default part-of-speech setting), and the
    results are joined back together with single spaces.

    Requires the NLTK WordNet corpus to be available locally
    (e.g. installed beforehand with nltk.download('wordnet')).
    """
    return " ".join(
        _LEMMATIZER.lemmatize(token) for token in _TOKENIZER.tokenize(sentence)
    )


# Lemmatize every comment in place
df['Comment'] = df['Comment'].apply(lemmatize)

In [164]:
# Convert to TFIDF feature values
# `stopwords` was never imported by any cell, so on a fresh kernel this cell
# raised NameError; import it here next to its only use.
# Requires the NLTK stopwords corpus to be available locally
# (e.g. installed beforehand with nltk.download('stopwords')).
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfconverter = TfidfVectorizer(stop_words=stopwords.words('english'))
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so vocabulary and IDF weights are computed with the test comments
# included. Consider fitting on the training rows only.
# .toarray() turns the sparse TF-IDF matrix into a dense array.
vectorized = tfidfconverter.fit_transform(df['Comment']).toarray()


In [165]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    vectorized,
    df['Label'],
    test_size=0.2,
    random_state=0,
)

In [166]:
from sklearn.ensemble import RandomForestClassifier

# Train a 10-tree random forest with a fixed seed. fit() returns the estimator
# itself, so construction and training are chained; the bare name on the last
# line keeps the estimator's repr as the cell output.
classifier = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_train, y_train)
classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [167]:
# Predict labels for the held-out test rows with the trained random forest
y_pred = classifier.predict(X_test)

In [168]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the random forest: per-class precision/recall/F1 report first,
# overall accuracy on the following line.
print(
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

       False       0.91      1.00      0.95       110
        True       1.00      0.27      0.42        15

   micro avg       0.91      0.91      0.91       125
   macro avg       0.95      0.63      0.69       125
weighted avg       0.92      0.91      0.89       125

0.912


In [169]:
# Create a baseline: a dummy classifier that ignores the features and draws
# predictions at random according to the class distribution seen in training.
from sklearn.dummy import DummyClassifier

# The "stratified" strategy is random, so a fixed random_state is needed for
# the baseline scores to be reproducible across runs (the split and the random
# forest are already seeded with 0).
dummy_classifier = DummyClassifier(strategy="stratified", random_state=0)
dummy_classifier.fit(X_train, y_train)
baseline_pred = dummy_classifier.predict(X_test)

In [170]:
# Score the dummy baseline the same way as the model: per-class report first,
# overall accuracy on the following line.
print(
    classification_report(y_test, baseline_pred),
    accuracy_score(y_test, baseline_pred),
    sep="\n",
)

              precision    recall  f1-score   support

       False       0.87      0.85      0.86       110
        True       0.06      0.07      0.06        15

   micro avg       0.76      0.76      0.76       125
   macro avg       0.46      0.46      0.46       125
weighted avg       0.77      0.76      0.77       125

0.76


In [171]:
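# The random forest beats the stratified baseline (accuracy 0.912 vs 0.76; True-class F1 0.42 vs 0.06), woot!
# Caveat: its recall on the True class is only 0.27, so most documentation reviews are still missed.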