<a href="https://colab.research.google.com/github/Adrian-Muino/DMML2022_Geneva/blob/main/Mandatory_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Installation
!pip install sentence-transformers
!python -m spacy download fr_core_news_sm
!python -m spacy link fr_core_news_sm fr
!pip install tensorflow_hub
!pip install tensorflow_text
! pip install kaggle
!mkdir ~/.kaggle
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2022
!unzip detecting-french-texts-difficulty-level-2022.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 33.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 60.2 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.8 MB/s 
Building wheels for collected pa

In [4]:
import string
import re
from dmml_geneva_function import *
import pandas as pd

import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
nltk.download('punkt')

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, Perceptron
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils.multiclass import unique_labels
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from google.colab import drive
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_val_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Mount Google Drive (forcing a fresh mount), then read the training set from
# `training_data.csv` in the current working directory.
drive.mount('/content/drive', force_remount=True)
df_train = pd.read_csv("training_data.csv")
# `df` is a second name for the very same DataFrame object (not a copy).
df = df_train

# Baseline

In [None]:
# Baseline: the share of rows carrying the most frequent difficulty label,
# i.e. the accuracy obtained by always predicting that label.
difficulty_levels = ("A1", "A2", "B1", "B2", "C1", "C2")
level_counts = [int((df["difficulty"] == level).sum()) for level in difficulty_levels]

# Keep one named count per level, in the same order as `difficulty_levels`.
(difficulty_A1_count, difficulty_A2_count, difficulty_B1_count,
 difficulty_B2_count, difficulty_C1_count, difficulty_C2_count) = level_counts

baserate = max(level_counts) / len(df["difficulty"])
print("Baserate = ", baserate)

# Logistic Regression

In [None]:
# TF-IDF vectorizer that delegates tokenization to `spacy_tokenizer_sm`.
# NOTE(review): `spacy_tokenizer_sm` is not defined in this notebook -- presumably
# it is provided by the `dmml_geneva_function` star import; confirm.
tfidf_vector_spacy = TfidfVectorizer(
    tokenizer=spacy_tokenizer_sm,
)

In [None]:
# Features are the raw sentences; targets are their difficulty labels.
X, y = df["sentence"], df["difficulty"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Define classifier.
# `multi_class` is intentionally not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver and more than two classes the
# default ('auto', scikit-learn >= 0.22) already fits the multinomial model.
LR_spacy_model = LogisticRegression(solver="lbfgs")

# Create pipeline: TF-IDF vectorization followed by the classifier, so that raw
# sentences can be passed straight to fit/predict.
LR_spacy_pipe = Pipeline([
    ('vectorizer', tfidf_vector_spacy),
    ('classifier', LR_spacy_model),
])

# Fit on the training split, then predict the held-out test split.
LR_spacy_pipe.fit(X_train, y_train)
LR_spacy_pred = LR_spacy_pipe.predict(X_test)

# NOTE(review): `evaluate` and `reports` are not defined in this notebook --
# presumably they come from the `dmml_geneva_function` star import; confirm.
LR_spacy_report = evaluate(y_test, LR_spacy_pred)

# Store the model performance results under this model's name.
reports['Logistic Regression Spacy'] = LR_spacy_report

LR_spacy_report

In [None]:
# Visualize the logistic-regression predictions against the true test labels.
# NOTE(review): `plot_confusion_matrix` is not defined in this notebook --
# presumably it comes from `dmml_geneva_function`; confirm its signature there.
plot_confusion_matrix(y_test, LR_spacy_pred, LR_spacy_pipe)

In [None]:
# Boolean mask of the test rows the logistic regression got wrong. It is
# computed once, as a plain array, and reused for all three columns.
LR_spacy_error_mask = np.asarray(LR_spacy_pred != y_test)

badly_predicted = pd.DataFrame({
    'sentence': X_test[LR_spacy_error_mask],
    'predicted': LR_spacy_pred[LR_spacy_error_mask],
    'true': y_test[LR_spacy_error_mask],
})

# Show up to three misclassified sentences. The sample size is capped so the
# cell does not raise when fewer than three rows are wrong, and the seed makes
# the same examples appear on every run.
n_examples = min(3, len(badly_predicted))
for row_label, row in badly_predicted.sample(n_examples, random_state=0).iterrows():
    print(row.sentence)
    print(f"Predicted: {row.predicted}")
    print(f"Actual: {row.true}")

# K-nearest neighbors algorithm

In [None]:
# Candidate neighbourhood sizes: every k from 1 to 99.
Nknn = list(range(1, 100))
param_grid = {'n_neighbors': Nknn}

# Base classifier whose `n_neighbors` is tuned below.
knn_spacy_model = KNeighborsClassifier()

# 5-fold grid search over k, scored on accuracy.
knn_spacy_grid = GridSearchCV(
    knn_spacy_model,
    param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=False,
    verbose=1,
)

# The grid search is the final step of the pipeline, so it receives the
# TF-IDF features produced by the vectorizer step.
# NOTE(review): the vectorizer is therefore fitted on all of X_train before the
# grid search splits it into folds, so each validation fold has already
# contributed to the TF-IDF vocabulary/weights.
knn_spacy_pipe = Pipeline([
    ('vectorizer', tfidf_vector_spacy),
    ('classifier', knn_spacy_grid),
])

# Fit on the training split (this runs the whole grid search).
knn_spacy_pipe.fit(X_train, y_train)

# Keep the winning k for the next cell and show it.
best_param_knn_spacy = knn_spacy_grid.best_params_.get('n_neighbors')
print(knn_spacy_grid.best_params_)

In [None]:
# k-NN classifier using the neighbour count selected by the grid search.
knn_spacy_classifier = KNeighborsClassifier(n_neighbors=best_param_knn_spacy)

# Pipeline: TF-IDF vectorization followed by the classifier.
KNN_spacy_pipe = Pipeline(steps=[
    ('vectorizer', tfidf_vector_spacy),
    ('classifier', knn_spacy_classifier),
])

# Train on the training split, then predict the held-out test split.
KNN_spacy_pipe.fit(X_train, y_train)
KNN_spacy_pred = KNN_spacy_pipe.predict(X_test)

# NOTE(review): `evaluate` and `reports` are not defined in this notebook --
# presumably they come from the `dmml_geneva_function` star import; confirm.
KNN_spacy_pred_report = evaluate(y_test, KNN_spacy_pred)

# Store the model performance results under this model's name.
reports['KNN Spacy'] = KNN_spacy_pred_report

KNN_spacy_pred_report

In [None]:
# Visualize the k-NN predictions against the true test labels.
# NOTE(review): `plot_confusion_matrix` is not defined in this notebook --
# presumably it comes from `dmml_geneva_function`; confirm its signature there.
plot_confusion_matrix(y_test, KNN_spacy_pred, KNN_spacy_pipe)

# Decision tree

In [None]:
def run_cross_validation_on_trees(X, y, tree_depths, cv=5, scoring='accuracy',
                                  X_eval=None, y_eval=None, random_state=None):
    """Cross-validate one decision tree per depth and score it on an evaluation set.

    Parameters
    ----------
    X : array-like or sparse matrix
        Numeric training features. Raw text must be vectorized beforehand,
        because a decision tree cannot be fitted on strings.
    y : array-like
        Training labels.
    tree_depths : iterable of int
        Values of ``max_depth`` to try, one tree per value.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.
    scoring : str, default 'accuracy'
        Scoring forwarded to ``cross_val_score``.
    X_eval, y_eval : optional
        Data on which each tree, refitted on all of ``(X, y)``, is scored with
        its ``score`` method. When either is omitted, ``(X, y)`` itself is
        used, i.e. the training accuracy is reported.
    random_state : int or None, default None
        Seed forwarded to ``DecisionTreeClassifier`` for repeatable trees.

    Returns
    -------
    cv_scores_mean, cv_scores_std, accuracy_scores : numpy.ndarray
        One entry per depth: mean and standard deviation of the
        cross-validation scores, and the score on the evaluation data.
    """
    if X_eval is None or y_eval is None:
        X_eval, y_eval = X, y

    cv_scores_mean = []
    cv_scores_std = []
    accuracy_scores = []
    for depth in tree_depths:
        tree_model = DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        # Use the data passed in by the caller, not notebook-level globals.
        cv_scores = cross_val_score(tree_model, X, y, cv=cv, scoring=scoring)
        cv_scores_mean.append(cv_scores.mean())
        cv_scores_std.append(cv_scores.std())
        accuracy_scores.append(tree_model.fit(X, y).score(X_eval, y_eval))
    return np.array(cv_scores_mean), np.array(cv_scores_std), np.array(accuracy_scores)


def plot_cross_validation_on_trees(depths, cv_scores_mean, cv_scores_std, accuracy_scores, title,
                                   accuracy_label='train accuracy'):
    """Plot cross-validation accuracy (with a +/- 2 std band) and a second accuracy curve per depth.

    Parameters
    ----------
    depths : sequence of int
        Tree depths, used as x values and x ticks.
    cv_scores_mean, cv_scores_std : numpy.ndarray
        Mean and standard deviation of the cross-validation scores per depth.
    accuracy_scores : numpy.ndarray
        Accuracy per depth on the evaluation data.
    title : str
        Figure title.
    accuracy_label : str, default 'train accuracy'
        Legend label for ``accuracy_scores``; set it to match the data the
        scores were actually computed on.
    """
    depths = list(depths)
    fig, ax = plt.subplots(1, 1, figsize=(15, 5))
    ax.plot(depths, cv_scores_mean, '-o', label='mean cross-validation accuracy', alpha=0.9)
    ax.fill_between(depths, cv_scores_mean - 2 * cv_scores_std, cv_scores_mean + 2 * cv_scores_std, alpha=0.2)
    # Freeze the y-range chosen for the cross-validation band so that the second
    # curve does not rescale the axis.
    ylim = ax.get_ylim()
    ax.plot(depths, accuracy_scores, '-*', label=accuracy_label, alpha=0.9)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Tree depth', fontsize=14)
    ax.set_ylabel('Accuracy', fontsize=14)
    ax.set_ylim(ylim)
    ax.set_xticks(depths)
    ax.legend()


numberoftry = range(1, 26)

# Decision trees need numeric input, so the sentences are vectorized once up
# front instead of passing raw text to the trees.
# NOTE(review): the TF-IDF is fitted on all of X_train before the CV folds are
# split, so validation folds contribute to the vocabulary/weights.
X_train_tfidf = tfidf_vector_spacy.fit_transform(X_train)
X_test_tfidf = tfidf_vector_spacy.transform(X_test)

sm_mean, sm_std, sm_scores = run_cross_validation_on_trees(
    X_train_tfidf, y_train, numberoftry,
    X_eval=X_test_tfidf, y_eval=y_test, random_state=0,
)

# plotting accuracy: the second curve is scored on the held-out test split
plot_cross_validation_on_trees(
    numberoftry, sm_mean, sm_std, sm_scores,
    'Accuracy per decision tree depth',
    accuracy_label='test accuracy',
)

In [None]:
# Decision tree with default hyper-parameters. The seed is fixed because
# scikit-learn permutes the features at every split, so an unseeded tree can
# differ between runs.
# NOTE(review): the depth search in the previous cell is not used here
# (max_depth keeps its default) -- confirm this is intended.
tree_spacy_model = DecisionTreeClassifier(random_state=0)

# Pipeline: TF-IDF vectorization followed by the classifier.
tree_spacy_pipe = Pipeline([
    ('vectorizer', tfidf_vector_spacy),
    ('classifier', tree_spacy_model),
])

# Train on the training split, then predict the held-out test split.
tree_spacy_pipe.fit(X_train, y_train)
tree_spacy_pred = tree_spacy_pipe.predict(X_test)

# NOTE(review): `evaluate` and `reports` are not defined in this notebook --
# presumably they come from the `dmml_geneva_function` star import; confirm.
tree_spacy_report = evaluate(y_test, tree_spacy_pred)

# Store model performance results
reports['Decision Tree Spacy'] = tree_spacy_report

tree_spacy_report

In [None]:
# Visualize the decision-tree predictions against the true test labels.
# NOTE(review): `plot_confusion_matrix` is not defined in this notebook --
# presumably it comes from `dmml_geneva_function`; confirm its signature there.
plot_confusion_matrix(y_test, tree_spacy_pred, tree_spacy_pipe)

# Random Forest

In [None]:
# Random forest with default hyper-parameters. The seed is fixed because the
# bootstrap samples and feature subsets are drawn at random, so an unseeded
# forest gives different results on every run.
forest_spacy_model = RandomForestClassifier(random_state=0)

# Pipeline: TF-IDF vectorization followed by the classifier.
forest_spacy_pipe = Pipeline([
    ('vectorizer', tfidf_vector_spacy),
    ('classifier', forest_spacy_model),
])

# Train on the training split, then predict the held-out test split.
forest_spacy_pipe.fit(X_train, y_train)
forest_spacy_pred = forest_spacy_pipe.predict(X_test)

# NOTE(review): `evaluate` and `reports` are not defined in this notebook --
# presumably they come from the `dmml_geneva_function` star import; confirm.
forest_spacy_report = evaluate(y_test, forest_spacy_pred)

# Store model performance results
reports['Random Forest Spacy'] = forest_spacy_report

forest_spacy_report

In [None]:
# Visualize the random-forest predictions against the true test labels.
# NOTE(review): `plot_confusion_matrix` is not defined in this notebook --
# presumably it comes from `dmml_geneva_function`; confirm its signature there.
plot_confusion_matrix(y_test, forest_spacy_pred, forest_spacy_pipe)