## Imports

In [1]:
import pandas as pd
import numpy as np
from time import time
from pprint import pprint
#plotting, !pip install plotly
import plotly.offline as pyo
import plotly.graph_objs as go
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score,precision_score, roc_auc_score, roc_curve, auc

## Functions

In [17]:
def make_grid_search(pipeline, parameters):
    
    # find the best parameters for both the feature extraction and the classifier
    grid_search = GridSearchCV(pipeline, parameters, cv=3, scoring = "roc_auc", n_jobs=-1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred, average = "micro")
    precision = precision_score(y_test, y_pred, average = "micro")
    f1_macro = f1_score(y_test, y_pred,average="macro")
    rocauc = roc_auc_score(y_test, y_pred, average="micro")
    print("Accuracy: " + str(accuracy) +
      "\nRecall " + str(recall) +
      "\nPrecision " + str(precision) +
      "\nF1-score " + str(f1_macro) +
      "\nROC_AUC " + str(rocauc))

    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    
    print(classification_report(y_test,y_pred,zero_division=False))

    return grid_search

## Loading the data

In [3]:
#replace this with colab path
df = pd.read_csv('/Users/niklastodenhoefer/Downloads/hair.csv')

In [4]:
df.tail(7)

Unnamed: 0.1,Unnamed: 0,text_generation_preprocessed,text_preprocessed_for_sentiment,target_sentiment_binary,cluster
76515,516897,"personally, cant really tell difference from r...","['person', 'tell', 'differ', 'regular', 'drugs...",positive,2
76516,516898,my scalp was itching insanely before started u...,"['scalp', 'itch', 'insan', 'start', 'shampoo',...",positive,2
76517,516901,my doctor prescribed this and the shampoo and ...,"['doctor', 'prescrib', 'shampoo', 'sprai', 'he...",negative,2
76518,516915,"best soap have ever purchased, hands down. my ...","['best', 'soap', 'purchas', 'hand', 'wife', 'a...",positive,2
76519,516945,bought this in december and loved how my hair ...,"['bought', 'decemb', 'love', 'hair', 'look', '...",negative,2
76520,516960,"color changes with lighting, cnd shellac are t...","['color', 'chang', 'light', 'cnd', 'shellac', ...",positive,2
76521,516965,my wife bought it and loves it. she says paint...,"['wife', 'bought', 'love', 'sai', 'paint', 'fa...",positive,2


In [5]:
df['binary_integers'] = np.where(df['target_sentiment_binary']=='positive', 1, 0)

In [6]:
df = df.drop_duplicates(subset=['text_preprocessed_for_sentiment'])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['text_preprocessed_for_sentiment'], 
                                                    df['binary_integers'], 
                                                    test_size=0.3, random_state=42, stratify=df['binary_integers'])

In [10]:
pipeline_dt = Pipeline([
    ('vect', TfidfVectorizer()),
    ('dt_classifier', DecisionTreeClassifier(random_state=42)),
])

In [11]:
parameters_dt = {   'dt_classifier__max_depth':[5,10,15,20,25],
                    'dt_classifier__min_samples_split':[2,4,6,8],
                    'dt_classifier__max_features':[2,4,6,8]
                }

In [12]:
new_parameters_dt = {   'dt_classifier__max_depth':[5,10,15,20,25]
                }

## Results Decision Tree

In [18]:
dt_grid = make_grid_search(pipeline_dt,parameters_dt)

Performing grid search...
pipeline: ['vect', 'dt_classifier']
parameters:
{'dt_classifier__max_depth': [5, 10, 15, 20, 25],
 'dt_classifier__max_features': [2, 4, 6, 8],
 'dt_classifier__min_samples_split': [2, 4, 6, 8]}
Accuracy: 0.8174143516246196
Recall 0.8174143516246196
Precision 0.8174143516246196
F1-score 0.4797430650969849
ROC_AUC 0.5140134987593559
done in 57.790s

Best score: 0.532
Best parameters set:
	dt_classifier__max_depth: 25
	dt_classifier__max_features: 8
	dt_classifier__min_samples_split: 6
              precision    recall  f1-score   support

           0       0.66      0.03      0.06      3778
           1       0.82      1.00      0.90     16596

    accuracy                           0.82     20374
   macro avg       0.74      0.51      0.48     20374
weighted avg       0.79      0.82      0.74     20374



In [19]:
new_dt_grid = make_grid_search(pipeline_dt,new_parameters_dt)

Performing grid search...
pipeline: ['vect', 'dt_classifier']
parameters:
{'dt_classifier__max_depth': [5, 10, 15, 20, 25]}
Accuracy: 0.8425444193580053
Recall 0.8425444193580053
Precision 0.8425444193580053
F1-score 0.6299925128724067
ROC_AUC 0.6052842240618486
done in 10.995s

Best score: 0.755
Best parameters set:
	dt_classifier__max_depth: 10
              precision    recall  f1-score   support

           0       0.75      0.23      0.35      3778
           1       0.85      0.98      0.91     16596

    accuracy                           0.84     20374
   macro avg       0.80      0.61      0.63     20374
weighted avg       0.83      0.84      0.81     20374



In [29]:
generated = ['iair ir shanpoo and conditioner and ias bleak hair and the shene is worth it. the smell is goedt an']

In [30]:
tfidf = TfidfVectorizer()

In [31]:
input = tfidf.fit_transform(generated)

In [32]:
new_dt_grid.predict(generated)

array([1])

predicted positive sentiment!

In [34]:
pred_seed = ['oo is goeat aod soet and seeled to be without it. toe lanz sooe some boowle oo my hair. the smell is']

In [35]:
input2 = tfidf.fit_transform(pred_seed)

In [36]:
new_dt_grid.predict(pred_seed)

array([1])

In [54]:
max_depths = np.linspace(1, 32, 32, endpoint=True)
train_results = []
test_results = []
for max_depth in max_depths:

    pipeline_dt = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier(max_depth= max_depth,random_state=42)),
    ])
    
    
    pipeline_dt.fit(X_train, y_train)
    train_pred = pipeline_dt.predict(X_train)

    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous train results
    train_results.append(roc_auc)
    y_pred = pipeline_dt.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous test results
    test_results.append(roc_auc)
    print('done for max_depth={}. current progress: {} %'.format(max_depth, (max_depth/len(max_depths)*100)))


done for max_depth=1.0. current progress: 3.125 %
done for max_depth=2.0. current progress: 6.25 %
done for max_depth=3.0. current progress: 9.375 %
done for max_depth=4.0. current progress: 12.5 %
done for max_depth=5.0. current progress: 15.625 %
done for max_depth=6.0. current progress: 18.75 %
done for max_depth=7.0. current progress: 21.875 %
done for max_depth=8.0. current progress: 25.0 %
done for max_depth=9.0. current progress: 28.125 %
done for max_depth=10.0. current progress: 31.25 %
done for max_depth=11.0. current progress: 34.375 %
done for max_depth=12.0. current progress: 37.5 %
done for max_depth=13.0. current progress: 40.625 %
done for max_depth=14.0. current progress: 43.75 %
done for max_depth=15.0. current progress: 46.875 %
done for max_depth=16.0. current progress: 50.0 %
done for max_depth=17.0. current progress: 53.125 %
done for max_depth=18.0. current progress: 56.25 %
done for max_depth=19.0. current progress: 59.375 %
done for max_depth=20.0. current prog

In [58]:
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=max_depths, y=train_results,
                    mode='lines+markers',
                    name='Training AUC'))
fig.add_trace(go.Scatter(x=max_depths, y=test_results,
                    mode='lines+markers',
                    name='Test AUC'))

fig.update_layout(title='Decision tree overfitting / underfitting check for max_depth = range(1,33)', xaxis_title = 'Tree depth', yaxis_title = 'AUC score')

fig.show()

---

In [20]:
pipeline_rf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('rf_classifier', RandomForestClassifier(random_state=42)),
])

In [21]:
parameters_rf = {   'rf_classifier__n_estimators':[25,50,100,200],
                    'rf_classifier__max_features':[2,4,6,8],
                    'rf_classifier__min_samples_split':[2,4,6,8],
                    'rf_classifier__max_depth':[5,10,15,20,25]
                }

In [22]:
rf_grid = make_grid_search(pipeline_rf,parameters_rf)

Performing grid search...
pipeline: ['vect', 'rf_classifier']
parameters:
{'rf_classifier__max_depth': [5, 10, 15, 20, 25],
 'rf_classifier__max_features': [2, 4, 6, 8],
 'rf_classifier__min_samples_split': [2, 4, 6, 8],
 'rf_classifier__n_estimators': [25, 50, 100, 200]}
Accuracy: 0.814567586139197
Recall 0.814567586139197
Precision 0.814567586139197
F1-score 0.4489045171760887
ROC_AUC 0.5
done in 383.450s

Best score: 0.860
Best parameters set:
	rf_classifier__max_depth: 25
	rf_classifier__max_features: 8
	rf_classifier__min_samples_split: 8
	rf_classifier__n_estimators: 200
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      3778
           1       0.81      1.00      0.90     16596

    accuracy                           0.81     20374
   macro avg       0.41      0.50      0.45     20374
weighted avg       0.66      0.81      0.73     20374



In [23]:
newparameters_rf = {    'rf_classifier__n_estimators':[25,50,100,200],
                        'rf_classifier__max_depth':[5,10,15,20,25]
                }

In [24]:
rf_grid_new = make_grid_search(pipeline_rf,newparameters_rf)

Performing grid search...
pipeline: ['vect', 'rf_classifier']
parameters:
{'rf_classifier__max_depth': [5, 10, 15, 20, 25],
 'rf_classifier__n_estimators': [25, 50, 100, 200]}
Accuracy: 0.8150584077746147
Recall 0.8150584077746147
Precision 0.8150584077746147
F1-score 0.45166588954867726
ROC_AUC 0.5013234515616729
done in 50.925s

Best score: 0.891
Best parameters set:
	rf_classifier__max_depth: 25
	rf_classifier__n_estimators: 200
              precision    recall  f1-score   support

           0       1.00      0.00      0.01      3778
           1       0.81      1.00      0.90     16596

    accuracy                           0.82     20374
   macro avg       0.91      0.50      0.45     20374
weighted avg       0.85      0.82      0.73     20374



In [7]:
train_rf = []
test_rf = []

In [None]:
train_rf_n_estimators = []
test_rf_n_estimators = []

max_depths = np.linspace(1, 32, 32, endpoint=True)
for max_depth in max_depths:


    rf = RandomForestClassifier(max_depth=max_depth,random_state=42)
   
    rf.fit(X_train_vect, y_train)
    train_pred = rf.predict(X_train_vect)

    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous train results
    train_rf_n_estimators.append(roc_auc)
    y_pred = rf.predict(X_test_vect)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous test results
    test_rf_n_estimators.append(roc_auc)
    print('done for max_depth={}. current progress: {} %'.format(max_depth, (max_depth/len(max_depths)*100)))

In [29]:
max_depths = np.linspace(1, 32, 32, endpoint=True)
for max_depth in max_depths:

    #pipeline_rf = Pipeline([
    #('vect', TfidfVectorizer()),
    #('rf_classifier', RandomForestClassifier(max_depth=max_depth,random_state=42)),
    #])

    rf = RandomForestClassifier(max_depth=max_depth,random_state=42)
   
    rf.fit(X_train_vect, y_train)
    train_pred = rf.predict(X_train_vect)

    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous train results
    train_rf.append(roc_auc)
    y_pred = rf.predict(X_test_vect)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous test results
    test_rf.append(roc_auc)
    print('done for max_depth={}. current progress: {} %'.format(max_depth, (max_depth/len(max_depths)*100)))

done for max_depth=1.0. current progress: 3.125 %
done for max_depth=2.0. current progress: 6.25 %
done for max_depth=3.0. current progress: 9.375 %
done for max_depth=4.0. current progress: 12.5 %
done for max_depth=5.0. current progress: 15.625 %
done for max_depth=6.0. current progress: 18.75 %
done for max_depth=7.0. current progress: 21.875 %
done for max_depth=8.0. current progress: 25.0 %
done for max_depth=9.0. current progress: 28.125 %
done for max_depth=10.0. current progress: 31.25 %
done for max_depth=11.0. current progress: 34.375 %
done for max_depth=12.0. current progress: 37.5 %
done for max_depth=13.0. current progress: 40.625 %
done for max_depth=14.0. current progress: 43.75 %
done for max_depth=15.0. current progress: 46.875 %
done for max_depth=16.0. current progress: 50.0 %
done for max_depth=17.0. current progress: 53.125 %
done for max_depth=18.0. current progress: 56.25 %
done for max_depth=19.0. current progress: 59.375 %
done for max_depth=20.0. current prog

In [30]:
print(train_rf, test_rf)

[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5000567214974475, 0.500113442994895, 0.5003970504821327, 0.5003970504821327, 0.5013613159387408, 0.5018718094157686, 0.5028927963698242, 0.5040272263187748, 0.5042541123085649, 0.5055019852524107, 0.5070334656834941, 0.5092456040839478, 0.5120249574588769, 0.5144639818491208, 0.5179807146908678, 0.5211003970504822, 0.5246171298922291, 0.5293817356778219, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5000567214974475, 0.500113442994895, 0.5003970504821327, 0.5003970504821327, 0.5013613159387408, 0.5018718094157686, 0.5028927963698242, 0.5040272263187748, 0.5042541123085649, 0.5055019852524107, 0.5070334656834941, 0.5092456040839478, 0.5120249574588769, 0.5144639818491208, 0.5179807146908678, 0.5211003970504822, 0.5246171298922291, 0.5293817356778219] [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5001323451561672, 0.5, 0.50039703

In [31]:
# Create traces
fig4 = go.Figure()
fig4.add_trace(go.Scatter(x=max_depths, y=train_rf,
                    mode='lines+markers',
                    name='Training AUC'))
fig4.add_trace(go.Scatter(x=max_depths, y=test_rf,
                    mode='lines+markers',
                    name='Test AUC'))

fig4.update_layout(title='Random Forest overfitting / underfitting check for max_depth in range(1,33)', xaxis_title = 'Tree depth', yaxis_title = 'AUC score')

fig4.show()

In [63]:
# Create traces
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=max_depths, y=train_rf,
                    mode='lines+markers',
                    name='Training AUC'))
fig2.add_trace(go.Scatter(x=max_depths, y=test_rf,
                    mode='lines+markers',
                    name='Test AUC'))

fig2.update_layout(title='Random Forest overfitting / underfitting check for max_depth in range(1,33)', xaxis_title = 'Tree depth', yaxis_title = 'AUC score')

fig2.show()