### Binary customer reviews classifier
This project classifies each customer review into one of two classes: reviews that talk about the documentation and reviews that don't.

In [132]:
import pandas as pd

# Read the labelled reviews from disk, then show summary statistics as the cell output
df = pd.read_csv(filepath_or_buffer='data.csv')
df.describe()

Unnamed: 0,Comment,Label
count,792,792
unique,648,2
top,subscribe to my blog!,False
freq,110,587


In [133]:
# Preprocess the strings:
# 1. Replace special (non-word) characters with spaces and remove single stranded
#    characters, such as the "s" or "m" left behind once an apostrophe is removed
# 2. Remove a leading single character, collapse runs of whitespace into a single
#    space, trim both ends and convert to lower case
#
# regex=True is passed explicitly: pandas >= 2.0 defaults Series.str.replace to
# regex=False, which would treat these patterns as literal text and leave the
# comments uncleaned without raising any error.
df['Comment'] = (
    df['Comment']
    .str.replace(r'\W', ' ', regex=True)
    .str.replace(r'\s+[a-zA-Z0-9]\s+', ' ', regex=True)
)
# The strip matters for the duplicate removal that follows: without it a comment
# ending in punctuation keeps a trailing space and is not recognised as a
# duplicate of the same text written without the punctuation.
df['Comment'] = (
    df['Comment']
    .str.replace(r'^[a-zA-Z0-9]\s+', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.lower()
)

In [134]:
# 3. Drop duplicate comments.
# Only the first row of each distinct comment text is kept, so when the same text
# appears with different labels the label of the first occurrence wins.
# The original row index is preserved (it is not renumbered).
df.drop_duplicates(subset=["Comment"], keep="first", inplace=True)
df

Unnamed: 0,Comment,Label
0,please provide more direct way to get to the k...,True
1,confused about licences want mix of them,False
2,more logic tutorial,True
3,add feature,False
4,integrate with beacon technology and embedded ...,False
5,pretty good,False
6,subscribe to my blog,True
7,all good,False
8,is very intuitive,False
19,great team of responsive peeps love the product,False


In [135]:
# Lemmatize the words
import nltk

def lemmatize(sentence):
    """Return `sentence` with each whitespace-separated token lemmatized.

    The sentence is split with NLTK's WhitespaceTokenizer, every token is passed
    through WordNetLemmatizer.lemmatize using its default arguments, and the
    results are joined back together with single spaces.
    """
    splitter = nltk.tokenize.WhitespaceTokenizer()
    wordnet = nltk.stem.WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in splitter.tokenize(sentence))
# Lemmatize every comment, overwriting the column, then display the updated frame
df['Comment'] = df['Comment'].map(lemmatize)
df

Unnamed: 0,Comment,Label
0,please provide more direct way to get to the k...,True
1,confused about licence want mix of them,False
2,more logic tutorial,True
3,add feature,False
4,integrate with beacon technology and embedded ...,False
5,pretty good,False
6,subscribe to my blog,True
7,all good,False
8,is very intuitive,False
19,great team of responsive peep love the product,False


In [136]:
# Inspect the cleaned and lemmatized comment column
df.loc[:, 'Comment']

0      please provide more direct way to get to the k...
1                confused about licence want mix of them
2                                    more logic tutorial
3                                            add feature
4      integrate with beacon technology and embedded ...
5                                            pretty good
6                                   subscribe to my blog
7                                               all good
8                                      is very intuitive
19        great team of responsive peep love the product
20                quicker support response chat function
21     provide help tip at every field option with ex...
22                                         excellent job
23     remove the requirement of offline access to th...
24     it doesn show the number if it is result of fo...
25                                   graphical interface
26                                              good app
29                         it e

In [137]:
# Convert to TFIDF feature values
# `stopwords` is imported here so that the cell runs on a fresh kernel; it needs
# the NLTK "stopwords" corpus to be available locally (nltk.download('stopwords')).
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are excluded from the vocabulary.
tfidfconverter = TfidfVectorizer(stop_words=stopwords.words('english'))
# NOTE(review): the vectorizer is fitted on every comment before the train/test
# split below, so vocabulary and IDF weights are learned from the test rows too.
# The dense array is what the later cells consume.
vectorized = tfidfconverter.fit_transform(df['Comment']).toarray()


In [138]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable.
# NOTE(review): the split is not stratified on the label although the classes are
# imbalanced -- consider passing stratify=df['Label'].
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    vectorized,
    df['Label'],
    random_state=0,
    test_size=0.2,
)

In [139]:
from sklearn.ensemble import RandomForestClassifier
# Train a 10-tree random forest on the training split with a fixed seed.
classifier = RandomForestClassifier(random_state=0, n_estimators=10).fit(X_train, y_train)
# Trailing expression so the notebook displays the fitted estimator.
classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [140]:
# Label the held-out test rows with the trained random forest
y_pred = classifier.predict(X=X_test)

In [141]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the model: per-class precision/recall/F1 report first, overall accuracy second
print(
    classification_report(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

       False       0.91      1.00      0.95       110
        True       1.00      0.27      0.42        15

   micro avg       0.91      0.91      0.91       125
   macro avg       0.95      0.63      0.69       125
weighted avg       0.92      0.91      0.89       125

0.912


In [146]:
# Create a baseline to compare the model against: a dummy classifier using the
# "stratified" strategy, fitted on the same training split as the random forest.
from sklearn.dummy import DummyClassifier
# The "stratified" strategy draws predictions at random, so the seed is fixed
# (as it is for the split and the random forest) to make the baseline scores
# reproducible from run to run.
dummy_classifier = DummyClassifier(strategy="stratified", random_state=0)
dummy_classifier.fit(X_train, y_train)
baseline_pred = dummy_classifier.predict(X_test)

In [147]:
# Score the dummy baseline with the same report and accuracy metric used for the model
print(
    classification_report(y_true=y_test, y_pred=baseline_pred),
    accuracy_score(y_true=y_test, y_pred=baseline_pred),
    sep='\n',
)

              precision    recall  f1-score   support

       False       0.89      0.85      0.87       110
        True       0.16      0.20      0.18        15

   micro avg       0.78      0.78      0.78       125
   macro avg       0.52      0.53      0.52       125
weighted avg       0.80      0.78      0.79       125

0.776


In [None]:
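# Model performs better than the stratified random baseline (accuracy 0.912 vs 0.776), woot!
# Caveat: recall on the True class is only 0.27, and always predicting False would
# already score 110/125 = 0.88 accuracy on this test set.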