### Import Data

In [38]:
import pandas as pd
nut_rep = pd.read_csv("nutrition_repos_clean_stemmed_lemmatize.csv")
nut_rep["language"].value_counts()

other         52
JavaScript    34
Python        19
Java          12
Name: language, dtype: int64

### Split Data into Train/Test

In [39]:
from split_get_scale import SplitGetScale

sgs = SplitGetScale()
train, test = sgs.split(nut_rep)

In [40]:
train.shape, test.shape

((93, 5), (24, 5))

In [41]:
train.head()

Unnamed: 0,language,readme_contents,clean,clean_stemmed,clean_lemmatized
31,Python,The internet is full of recipes. A recipe is r...,the internet is full of recipes a recipe is re...,internet full recip recip realli onli direct i...,internet full recipe recipe really direction i...
29,other,[![Build Status](https://travis-ci.org/openpan...,build statushttpstravisciorgopenpantryopenpant...,build statushttpstravisciorgopenpantryopenpant...,build statushttpstravisciorgopenpantryopenpant...
107,JavaScript,# NutritionIQ\nNutritionIQ is a machine-learni...,nutritioniq\nnutritioniq is a machinelearning ...,nutritioniq nutritioniq machinelearn base rece...,nutritioniq nutritioniq machinelearning based ...
23,other,"<p align=""center"">\n <a href=""https://calori...",p aligncenter\n a hrefhttpscaloriesincom targe...,p aligncent hrefhttpscaloriesincom targetblank...,p aligncenter hrefhttpscaloriesincom targetbla...
9,other,# NutritionCal\n\nNutrition Diary is a food an...,nutritioncal\n\nnutrition diary is a food and ...,nutritionc nutrit diari food nutrit app help e...,nutritioncal nutrition diary food nutrition ap...


### Split Train/Test into X_train/y_train

In [42]:
(X_train_stemmed, y_train_stemmed), (X_test_stemmed, y_test_stemmed) = sgs.get_Xy(train, test, cols_train="clean_stemmed")
(X_train_lemmed, y_train_lemmed), (X_test_lemmed, y_test_lemmed) = sgs.get_Xy(train, test, cols_train="clean_lemmatized")

### Machine Learning

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#### Baseline

In [44]:
baseline_prediction = pd.Series(["other"] * len(train["language"]))
baseline_prediction

0     other
1     other
2     other
3     other
4     other
      ...  
88    other
89    other
90    other
91    other
92    other
Length: 93, dtype: object

In [45]:
print(f"Baseline Accuracy: {round(accuracy_score(train['language'], baseline_prediction), 2)}")

Baseline Accuracy: 0.42


#### Modeling

In [46]:
estimator = RandomForestClassifier(random_state=123)
params_d = {"n_estimators": [10, 25, 50, 100], "max_depth": [5, 10, 25, None], "min_samples_split": [2, 4, 6, 8, 10], "min_samples_leaf": [1, 5, 10], "bootstrap": [True, False], "warm_start": [True, False]}

In [47]:
fit_stemmed = RandomizedSearchCV(estimator=estimator, param_distributions=params_d, n_iter=50, random_state=123).fit(X_train_stemmed, y_train_stemmed)
fit_lemmed = RandomizedSearchCV(estimator=estimator, param_distributions=params_d, n_iter=50, random_state=123).fit(X_train_lemmed, y_train_lemmed)

In [48]:
fit_stemmed.best_estimator_, fit_stemmed.best_params_

(RandomForestClassifier(bootstrap=False, max_depth=5, min_samples_leaf=5,
                        min_samples_split=6, n_estimators=10, random_state=123,
                        warm_start=True),
 {'warm_start': True,
  'n_estimators': 10,
  'min_samples_split': 6,
  'min_samples_leaf': 5,
  'max_depth': 5,
  'bootstrap': False})

In [49]:
fit_lemmed.best_estimator_, fit_lemmed.best_params_

(RandomForestClassifier(min_samples_split=4, n_estimators=10, random_state=123,
                        warm_start=True),
 {'warm_start': True,
  'n_estimators': 10,
  'min_samples_split': 4,
  'min_samples_leaf': 1,
  'max_depth': None,
  'bootstrap': True})

In [50]:
rfc_stem =  RandomForestClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=10, random_state=123, warm_start=True, bootstrap=False)
rfc_lem =  RandomForestClassifier(min_samples_split=4, n_estimators=10, min_samples_leaf=1, random_state=123, warm_start=True, max_depth=None, bootstrap=True)

rfc_stem.fit(X_train_stemmed, y_train_stemmed)
rfc_lem.fit(X_train_lemmed, y_train_lemmed)

y_pred_train_stem = rfc_stem.predict(X_train_stemmed)
y_pred_train_lem = rfc_lem.predict(X_train_lemmed)

y_pred_test_stem = rfc_stem.predict(X_test_stemmed)
y_pred_test_lem = rfc_lem.predict(X_test_lemmed)

In [51]:
print(f"TRAIN\nRandom Forest Classifier Stemmed Accuracy: {round(accuracy_score(y_train_stemmed, y_pred_train_stem), 2)}")
print(f"Random Forest Classifier Lemmed Accuracy: {round(accuracy_score(y_test_stemmed, y_pred_test_stem), 2)}")


print(f"\nTEST\nRandom Forest Classifier Stemmed Accuracy Test: {round(accuracy_score(y_train_lemmed, y_pred_train_lem), 2)}")
print(f"Random Forest Classifier Lemmed Accuracy Test: {round(accuracy_score(y_test_lemmed, y_pred_test_lem), 2)}")

TRAIN
Random Forest Classifier Stemmed Accuracy: 0.68
Random Forest Classifier Lemmed Accuracy: 0.5

TEST
Random Forest Classifier Stemmed Accuracy Test: 0.98
Random Forest Classifier Lemmed Accuracy Test: 0.5


#### Both beat baseline, but it looks really bad