<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 3: NLP on Intermittent Fasting and Keto Diet

---

# Part 3: Modelling

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay, f1_score

import warnings
warnings.simplefilter("ignore")

In [2]:
# The data sets are large, so lift pandas' display caps on both columns and rows
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [3]:
# Load the cleaned data set from its CSV file
df = pd.read_csv(filepath_or_buffer='data/subreddit_cleaned.csv')

In [4]:
# Preview the first rows to confirm the cleaned data loaded as expected
df.head()

Unnamed: 0,title,selftext,subreddit,created_utc,post_word_count,post_length,clean_text
0,Plateau sruggles,I (27F) have been intermittent fasting for abo...,0,1625589667,123,660,27f intermittent fasting 2 months starting wei...
1,Can I still do IF/OMAD now that I started exer...,"I started lifting 4x/week (about 40 minutes), ...",0,1625586042,127,679,started lifting 4xweek 40 minutes well taking ...
2,A new mindset,"Hello everyone,\n\n I am a mostly lurker here...",0,1625584307,182,929,hello everyone mostly lurker reddit first id l...
3,Weekend habits are making it difficult to loos...,"Hi everyone,\n\nI have been doing IF (16:8) fo...",0,1625582039,110,569,hi everyone 168 almost 3 years remained consis...
4,Are these times acceptable for IF?,"So, due to loss of employment, family has take...",0,1625582007,167,806,due loss employment family taken bother trying...


In [5]:
# Function for lemmatizing
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    The string is split on whitespace, each token is passed through a
    ``WordNetLemmatizer``, and the results are joined back together with
    single spaces.
    """
    wnl = WordNetLemmatizer()
    tokens = text.split()
    lemmas = map(wnl.lemmatize, tokens)
    return ' '.join(lemmas)

In [6]:
# Store a lemmatized version of the cleaned text in a new column
df['cleantext_lemm'] = df['clean_text'].map(lemmatize_text)

In [7]:
# Preview again to compare the new cleantext_lemm column with clean_text
df.head()

Unnamed: 0,title,selftext,subreddit,created_utc,post_word_count,post_length,clean_text,cleantext_lemm
0,Plateau sruggles,I (27F) have been intermittent fasting for abo...,0,1625589667,123,660,27f intermittent fasting 2 months starting wei...,27f intermittent fasting 2 month starting weig...
1,Can I still do IF/OMAD now that I started exer...,"I started lifting 4x/week (about 40 minutes), ...",0,1625586042,127,679,started lifting 4xweek 40 minutes well taking ...,started lifting 4xweek 40 minute well taking l...
2,A new mindset,"Hello everyone,\n\n I am a mostly lurker here...",0,1625584307,182,929,hello everyone mostly lurker reddit first id l...,hello everyone mostly lurker reddit first id l...
3,Weekend habits are making it difficult to loos...,"Hi everyone,\n\nI have been doing IF (16:8) fo...",0,1625582039,110,569,hi everyone 168 almost 3 years remained consis...,hi everyone 168 almost 3 year remained consist...
4,Are these times acceptable for IF?,"So, due to loss of employment, family has take...",0,1625582007,167,806,due loss employment family taken bother trying...,due loss employment family taken bother trying...


## Baseline Model

In [8]:
# Share of each class in the target; the larger share is the majority-class baseline
df['subreddit'].value_counts(normalize = True)

1    0.544412
0    0.455588
Name: subreddit, dtype: float64

Given that the two classes are fairly balanced, our baseline accuracy is the proportion of the majority class, which is our target subreddit -- Keto diet. Our baseline accuracy is therefore 54.4%. Hopefully we can build models that score better than this baseline.

## Modelling

In [9]:
# Features are the lemmatized posts; the target is the subreddit label
X = df['cleantext_lemm']
y = df['subreddit']

# Stratify on y so that both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [10]:
# Report the size of the training and test targets, one per line
print(y_train.shape, y_test.shape, sep='\n')

(7717,)
(2573,)


In [11]:
# Vectorizer parameters:

# Both vectorizers are searched over the same grid; only the pipeline-step
# prefix in the parameter names differs.
_shared_vec_grid = {
    'max_features': [None, 5000],
    'max_df': [0.5, 0.9],
    'ngram_range': [(1, 1), (1, 2)],
}

cvec_params = {
    f'cvec__{name}': list(values) for name, values in _shared_vec_grid.items()
}

tvec_params = {
    f'tvec__{name}': list(values) for name, values in _shared_vec_grid.items()
}

In [12]:
# Model parameters:

# LogisticRegression's default 'lbfgs' solver does not support the 'l1' penalty,
# so with the default solver every 'l1' candidate fails to fit and GridSearchCV
# scores it as NaN (silently here, because warnings are suppressed above).
# Pinning 'liblinear', which supports both penalties, lets the whole grid be
# evaluated.
lr_params = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [0.1, 1, 10],
    'lr__solver': ['liblinear'],
}

# k-nearest neighbours: neighbourhood size and vote weighting
knn_params = dict(
    knn__n_neighbors=[3, 5, 7],
    knn__weights=['uniform', 'distance'],
)

# Multinomial naive Bayes: smoothing parameter
nb_params = dict(nb__alpha=[0.1, 1, 10])

# Random forest: number of trees, tree depth and minimum leaf size
rf_params = dict(
    rf__n_estimators=[100, 150],
    rf__max_depth=[None, 5],
    rf__min_samples_leaf=[1, 5],
)

# AdaBoost: number of boosting rounds and learning rate
ada_params = dict(
    ada__n_estimators=[50, 100],
    ada__learning_rate=[0.5, 1.0],
)

# Gradient boosting: tree depth, number of boosting rounds and learning rate
gb_params = dict(
    gb__max_depth=[3, 4],
    gb__n_estimators=[100, 200],
    gb__learning_rate=[0.5, 1.0],
)

In [13]:
def model(vec_inst, vectorizer, mod_inst, model, vec_params, mod_params):
    """Grid-search a vectorizer + classifier pipeline and summarise the best fit.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``. A text report is printed, the test
    confusion matrix is plotted, and the headline metrics are returned.

    Parameters
    ----------
    vec_inst : str
        Pipeline step name for the vectorizer. It must match the prefix used
        in the keys of ``vec_params`` (e.g. ``'cvec'`` for ``'cvec__max_df'``).
    vectorizer : estimator
        Vectorizer instance used as the first pipeline step.
    mod_inst : str
        Pipeline step name for the classifier. It must match the prefix used
        in the keys of ``mod_params``.
    model : estimator
        Classifier instance used as the final pipeline step. Inside this
        function the name shadows the function itself.
    vec_params, mod_params : dict
        Parameter grids for the two steps; they are merged into a single
        ``param_grid`` for ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``model_vec``, ``best_params``,
        ``train_score``, ``test_score``, ``sensitivity``, ``specificity``,
        ``precision``, ``f1_score``, ``tn``, ``fp``, ``fn`` and ``tp``.
        ``train_score`` holds ``best_score_``, i.e. the mean cross-validated
        score of the best candidate, not a score on the full training set.
    """
    pipe = Pipeline([
        (vec_inst, vectorizer),
        (mod_inst, model),
    ])

    gs = GridSearchCV(pipe, param_grid={**vec_params, **mod_params})
    gs.fit(X_train, y_train)

    # Predict and score the held-out set once; both results are reused below
    # for the report, the confusion matrix and the returned summary.
    preds = gs.predict(X_test)
    test_score = gs.score(X_test, y_test)

    print('*'*80)
    print(f'MODEL = {model}, VECTORIZER = {vectorizer}')
    print('*'*80)
    print(f'Best fitting parameters: {gs.best_params_}\n')
    print(f'Best score: {round(gs.best_score_, 3)}')
    print(f'Test score: {round(test_score, 3)}\n')

    print('Classification report:')
    print(classification_report(y_test, preds))

    # The four-way unpacking assumes a binary target (2x2 matrix).
    cm = confusion_matrix(y_test, preds)
    tn, fp, fn, tp = cm.ravel()

    # Plot from the matrix already computed instead of the deprecated
    # plot_confusion_matrix helper, which also re-ran the prediction.
    ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=gs.classes_,
    ).plot(cmap='Blues', values_format='d')

    # One-row summary of the best model, in the column order callers expect
    return pd.DataFrame({
        'model_vec': [f'{mod_inst}_{vec_inst}'],
        'best_params': [gs.best_params_],
        'train_score': [gs.best_score_],
        'test_score': [test_score],
        'sensitivity': [tp / (tp + fn)],
        'specificity': [tn / (tn + fp)],
        'precision': [tp / (tp + fp)],
        'f1_score': [f1_score(y_test, preds)],
        'tn': [tn],
        'fp': [fp],
        'fn': [fn],
        'tp': [tp],
    })

In [14]:
# Collects the one-row summary frame returned by each model() run
df_all = []

In [None]:
lr_cvec = model('cvec', CountVectorizer(), 'lr', LogisticRegression(), cvec_params, lr_params)
df_all.append(lr_cvec)

# Display last: a notebook cell only renders the value of its final expression
lr_cvec.head().T

In [None]:
lr_tvec = model('tvec', TfidfVectorizer(), 'lr', LogisticRegression(), tvec_params, lr_params)
df_all.append(lr_tvec)

# Display last: a notebook cell only renders the value of its final expression
lr_tvec.head().T

In [None]:
# Count-vector variant, mirroring the lr_cvec / lr_tvec pairing used for
# logistic regression (the TF-IDF variant runs in the following cell)
knn_cvec = model('cvec', CountVectorizer(), 'knn', KNeighborsClassifier(), cvec_params, knn_params)
df_all.append(knn_cvec)

# Display last: a notebook cell only renders the value of its final expression
knn_cvec.head().T

In [None]:
knn_tvec = model('tvec', TfidfVectorizer(), 'knn', KNeighborsClassifier(), tvec_params, knn_params)
df_all.append(knn_tvec)

# Display last: a notebook cell only renders the value of its final expression
knn_tvec.head().T

In [None]:
# Count-vector variant, mirroring the lr_cvec / lr_tvec pairing used for
# logistic regression (the TF-IDF variant runs in the following cell)
nb_cvec = model('cvec', CountVectorizer(), 'nb', MultinomialNB(), cvec_params, nb_params)
df_all.append(nb_cvec)

# Display last: a notebook cell only renders the value of its final expression
nb_cvec.head().T

In [None]:
nb_tvec = model('tvec', TfidfVectorizer(), 'nb', MultinomialNB(), tvec_params, nb_params)
df_all.append(nb_tvec)

# Display last: a notebook cell only renders the value of its final expression
nb_tvec.head().T

In [None]:
# Count-vector variant, mirroring the lr_cvec / lr_tvec pairing used for
# logistic regression (the TF-IDF variant runs in the following cell)
rf_cvec = model('cvec', CountVectorizer(), 'rf', RandomForestClassifier(), cvec_params, rf_params)
df_all.append(rf_cvec)

# Display last: a notebook cell only renders the value of its final expression
rf_cvec.head().T

In [None]:
rf_tvec = model('tvec', TfidfVectorizer(), 'rf', RandomForestClassifier(), tvec_params, rf_params)
df_all.append(rf_tvec)

# Display last: a notebook cell only renders the value of its final expression
rf_tvec.head().T

In [None]:
# Count-vector variant, mirroring the lr_cvec / lr_tvec pairing used for
# logistic regression (the TF-IDF variant runs in the following cell)
ada_cvec = model('cvec', CountVectorizer(), 'ada', AdaBoostClassifier(), cvec_params, ada_params)
df_all.append(ada_cvec)

# Display last: a notebook cell only renders the value of its final expression
ada_cvec.head().T

In [None]:
ada_tvec = model('tvec', TfidfVectorizer(), 'ada', AdaBoostClassifier(), tvec_params, ada_params)
df_all.append(ada_tvec)

# Display last: a notebook cell only renders the value of its final expression
ada_tvec.head().T

In [None]:
# Count-vector variant, mirroring the lr_cvec / lr_tvec pairing used for
# logistic regression (the TF-IDF variant runs in the following cell)
gb_cvec = model('cvec', CountVectorizer(), 'gb', GradientBoostingClassifier(), cvec_params, gb_params)
df_all.append(gb_cvec)

# Display last: a notebook cell only renders the value of its final expression
gb_cvec.head().T

In [None]:
gb_tvec = model('tvec', TfidfVectorizer(), 'gb', GradientBoostingClassifier(), tvec_params, gb_params)
df_all.append(gb_tvec)

# Display last: a notebook cell only renders the value of its final expression
gb_tvec.head().T

In [None]:
# Stack the per-model summary rows into a single comparison table
full_df = pd.concat(df_all)

In [None]:
# Show cell contents in full so long values such as best_params are not truncated
pd.set_option('display.max_colwidth', None)

# Rank the models from highest to lowest test score and renumber the rows
ranked = full_df.sort_values(by='test_score', ascending=False)
ranked.reset_index(drop=True)

# Conclusion
My impression after reading a few posts that contain the words 'keto' and 'intermittent fast' is that many posters are on Reddit to seek advice or suggestions to help with their intermittent fasting or keto journey. A few people share their success and health improvements during the process, while others actually face health issues during their weight loss.