In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
#Read Data
X_train = pd.read_csv("../X_train.csv")
X_test = pd.read_csv("../X_test.csv")

In [22]:
X_train = X_train.dropna()
X_train.isnull().sum()

tokenized_text    0
mental_state      0
dtype: int64

In [23]:
X_test = X_test.dropna()
X_test.isnull().sum()

tokenized_text    0
mental_state      0
dtype: int64

In [25]:
y_train = X_train['mental_state']
X_train = X_train['tokenized_text']

y_test = X_test['mental_state']
X_test = X_test['tokenized_text']

In [26]:
X_train = X_train.apply(lambda x: x.split())

In [32]:
X_train

0          [im, never, good, enough, thing, im, good, bla...
1          [debussy, clair, de, lune, piano, year, old, e...
2          [listening, africa, toto, pm, real, sad, hours...
3          [years, gone, knowing, denying, depression, i,...
4          [i, pathetic, i, tired, mentally, physically, ...
                                 ...                        
1466239                           [public, transit, stories]
1466240    [point, giving, help, much, hello, i, professo...
1466241    [declaring, love, sex, friend, i, sent, exposi...
1466242    [song, you, stuck, your, head, today, cc, me, ...
1466243    [wrong, way, make, friends, going, rando, libr...
Name: tokenized_text, Length: 1465677, dtype: object

In [34]:
def identity_tokenizer(text):
    return text

tfidf_no_token = TfidfVectorizer(tokenizer=identity_tokenizer,preprocessor=identity_tokenizer,token_pattern=None, max_features = 400)

In [35]:
tfidf_no_token.fit(X_train)
fit_model = tfidf_no_token.transform(X_train)
#test_model = tfidf_no_token.transform(x_test)

In [36]:
X_train = fit_model

## Random Forest Vanilla

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=0)
rf.fit(X_train, y_train)

In [13]:
predict = rf.predict(X_test)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predict))
print(classification_report(y_test,predict))
print(accuracy_score(y_test, predict))

[[247 251]
 [197 305]]
              precision    recall  f1-score   support

           0       0.56      0.50      0.52       498
           1       0.55      0.61      0.58       502

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.55      0.55      0.55      1000

0.552


In [15]:
# code from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 10, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 6, 7, 8, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [21]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    0.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.7s finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=5,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 6, 7, 8, 10, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 31, 52, 73, 94,
                                                         115, 136, 157, 178,
                                                         200]},
                   random_state=42, verbose=2)

In [22]:
predictions = rf_random.predict(X_test)

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
predictions = rf_random.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[247 251]
 [198 304]]
              precision    recall  f1-score   support

           0       0.56      0.50      0.52       498
           1       0.55      0.61      0.58       502

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.55      0.55      0.55      1000

0.551


### DUMMY

In [8]:
data = np.array([int(x) for x in np.linspace(1, 10, num =1000)])

In [9]:
df = pd.DataFrame(data)
df['wot'] = pd.DataFrame(data)
df['money'] = pd.DataFrame(data)
df['car'] = pd.DataFrame(data)
df['rent'] = pd.DataFrame(data)
df['bots'] = pd.DataFrame(data)
df['label'] = pd.DataFrame(data)

In [10]:
y_train = np.random.randint(2, size=1000)
X_train = df

In [11]:
y_test = y_train
X_test = X_train