## Project UNIL_TUDOR

The first thing we do is to set the various parameters to start our project

In [None]:
# Attach Google Drive to the Colab runtime; force_remount refreshes an existing mount
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# install Kaggle
! pip install kaggle



In [None]:
!mkdir ~/.kaggle

In [None]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json


In [None]:
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

Downloading detecting-french-texts-difficulty-level-2023.zip to /content
  0% 0.00/303k [00:00<?, ?B/s]
100% 303k/303k [00:00<00:00, 118MB/s]


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import  TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import spacy
from spacy import displacy
from sklearn.base import TransformerMixin
import seaborn as sns

We import the data from Kaggle

In [None]:
# Load the three competition files: labelled training data, unlabelled test data, sample submission
df_training, df_test, df_sample = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_data.csv',
        'unlabelled_test_data.csv',
        'sample_submission.csv',
    )
)

We use a vectorizer like TfidfVectorizer to convert texts into numerical vectors.

In [None]:
# TF-IDF vectorizer with default settings; this single instance is passed as the
# first step of the grid-search and optimized pipelines built further down.
tfidf_vector = TfidfVectorizer()

We assign the sentences from our dataset to `X` and their corresponding difficulty levels to `y`.

In [None]:
# Features: the 'sentence' column; target: the 'difficulty' column
X, y = df_training['sentence'], df_training['difficulty']

After doing some research online, we found that an 80/20 train-test split is the most commonly recommended ratio. A 70/30 split is also suggested by some sources, but we prefer 80/20.

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

We have seen that methods such as KNN, Logistic Regression, Decision Tree and Random Forest are suggested. We'll start with the KNN method.

## 1. KNN Model (K-Nearest Neighbors)

Develop a KNN (K-Nearest Neighbors) model by using a TFIDF (Term Frequency-Inverse Document Frequency) vectorization approach.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Text -> TF-IDF features
tfidf_vectorizer = TfidfVectorizer()

# Baseline KNN; the arguments below are spelled out but equal the library defaults
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='auto',
    metric='minkowski',
    p=2,
)

# Chain vectorizer and classifier so raw sentences can be passed straight to fit/predict
knn_pipeline = Pipeline(steps=[
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('knn_classifier', knn_classifier),
])

# Fit on the training split (the fitted pipeline is the cell's displayed value)
knn_pipeline.fit(X_train, y_train)

As requested in the instructions, we display the Accuracy, Precision, Recall and F1 score.

In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Predict on the held-out split with the baseline KNN pipeline
predicted_labels_knn = knn_pipeline.predict(X_test)

# Precision, recall and F1 use the 'weighted' average across difficulty classes
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_labels_knn)
precision = precision_score(y_true=y_test, y_pred=predicted_labels_knn, average='weighted')
recall = recall_score(y_true=y_test, y_pred=predicted_labels_knn, average='weighted')
f1 = f1_score(y_true=y_test, y_pred=predicted_labels_knn, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.31979166666666664
Precision: 0.40447154448902534
Recall: 0.31979166666666664
F1 Score: 0.3028669637117734


There are many ways to improve a classifier's results. We chose hyperparameter optimization with GridSearchCV because it searches for the combination of parameters that gives the best cross-validated performance.

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 1-9 neighbours, Minkowski power p in {1, 2}, and both vote-weighting schemes
hyperparameters = dict(
    n_neighbors=np.arange(1, 10),
    p=np.arange(1, 3),
    weights=['uniform', 'distance'],
)

# 5-fold cross-validated search scored on accuracy, using all available cores
knn_grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# The search runs as the final pipeline step, i.e. on the TF-IDF features of the sentences
pipeline_knn_grid_search = Pipeline(steps=[
    ('tfidf_vectorizer', tfidf_vector),
    ('knn_grid_search', knn_grid_search),
])

pipeline_knn_grid_search.fit(X_train, y_train)
print('Best parameters found:', knn_grid_search.best_params_)

Best parameters found: {'n_neighbors': 4, 'p': 2, 'weights': 'distance'}


With the help of the Hyperparameter optimization, we'll try again to see if our results are more accurate.

In [None]:
# KNN configured with the settings reported by the grid search
optimized_knn_classifier = KNeighborsClassifier(
    algorithm='auto',
    weights='distance',
    n_neighbors=4,
    p=2,
)

# Same two-step layout as the baseline: TF-IDF features, then the classifier
optimized_knn_pipeline = Pipeline(steps=[
    ('tfidf_vectorizer', tfidf_vector),
    ('knn_classifier', optimized_knn_classifier),
])

optimized_knn_pipeline.fit(X_train, y_train)

We can see that the accuracy, precision, recall and f1 score are better than before.

In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Predict on the held-out split with the tuned KNN pipeline
optimized_predicted_labels_knn = optimized_knn_pipeline.predict(X_test)

# Precision, recall and F1 use the 'weighted' average across difficulty classes
optimized_accuracy = accuracy_score(y_true=y_test, y_pred=optimized_predicted_labels_knn)
optimized_precision = precision_score(y_true=y_test, y_pred=optimized_predicted_labels_knn, average='weighted')
optimized_recall = recall_score(y_true=y_test, y_pred=optimized_predicted_labels_knn, average='weighted')
optimized_f1 = f1_score(y_true=y_test, y_pred=optimized_predicted_labels_knn, average='weighted')

print("Accuracy:", optimized_accuracy)
print("Precision:", optimized_precision)
print("Recall:", optimized_recall)
print("F1 Score:", optimized_f1)

Accuracy: 0.36770833333333336
Precision: 0.4241811079766578
Recall: 0.36770833333333336
F1 Score: 0.3570560404187049


## 2. Decision Tree Classifier


Develop a Decision Tree Classifier Model by Using a TFIDF (Term Frequency-Inverse Document Frequency) vectorization approach.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Text -> TF-IDF features
tfidf_vectorizer = TfidfVectorizer()

# Baseline tree with default settings; the seed makes the fitted tree reproducible
decision_tree_classifier = DecisionTreeClassifier(random_state=0)

# Chain vectorizer and classifier so raw sentences can be passed straight to fit/predict
decision_tree_pipeline = Pipeline(steps=[
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('decision_tree_classifier', decision_tree_classifier),
])

# Fit on the training split (the fitted pipeline is the cell's displayed value)
decision_tree_pipeline.fit(X_train, y_train)

As requested in the instructions, we display the Accuracy, Precision, Recall and F1 score.

In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Predict on the held-out split with the baseline decision-tree pipeline
predicted_labels_decision_tree = decision_tree_pipeline.predict(X_test)

# Precision, recall and F1 use the 'weighted' average across difficulty classes
accuracy_decision_tree = accuracy_score(y_true=y_test, y_pred=predicted_labels_decision_tree)
precision_decision_tree = precision_score(y_true=y_test, y_pred=predicted_labels_decision_tree, average='weighted')
recall_decision_tree = recall_score(y_true=y_test, y_pred=predicted_labels_decision_tree, average='weighted')
f1_decision_tree = f1_score(y_true=y_test, y_pred=predicted_labels_decision_tree, average='weighted')

print("Accuracy:", accuracy_decision_tree)
print("Precision:", precision_decision_tree)
print("Recall:", recall_decision_tree)
print("F1 Score:", f1_decision_tree)

Accuracy: 0.296875
Precision: 0.3003578282448995
Recall: 0.296875
F1 Score: 0.2952457659392269


We improve the model by using Hyperparameter optimization with GridSearchCV



In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths 5..24, compared with 5-fold cross-validation
depth_grid = {'max_depth': np.arange(5, 25)}

# random_state is fixed so the search is reproducible and consistent with the
# other decision trees in this notebook, which all use random_state=0.
decision_tree_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=0), depth_grid, cv=5)

# NOTE: this rebinds `decision_tree_pipeline`; from here on the name refers to the
# grid-search pipeline, not the baseline decision-tree pipeline defined earlier.
decision_tree_pipeline = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('grid_search', decision_tree_grid_search)
])

decision_tree_pipeline.fit(X_train, y_train)
predictions = decision_tree_pipeline.predict(X_test)

print("Best parameters found:", decision_tree_grid_search.best_params_)

Best parameters found: {'max_depth': 22}


With the help of the Hyperparameter optimization, we'll try again to see if our results are more accurate.

In [None]:
# Build the tuned tree from the depth the grid search actually selected, instead of
# a hand-copied constant that can drift from the search result.
optimized_decision_tree = DecisionTreeClassifier(
    random_state=0,
    **decision_tree_grid_search.best_params_,
)
optimized_decision_tree_pipeline = Pipeline([
    ('vectorizer', tfidf_vector),
    ('classifier', optimized_decision_tree)
])

optimized_decision_tree_pipeline.fit(X_train, y_train)


We can see that the accuracy, precision and recall are slightly better than before, while the F1 score is slightly lower.

In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Predict on the held-out split with the tuned decision-tree pipeline
optimised_predicted_labels_decision_tree = optimized_decision_tree_pipeline.predict(X_test)

# Precision, recall and F1 use the 'weighted' average across difficulty classes
optimised_accuracy_decision_tree = accuracy_score(y_true=y_test, y_pred=optimised_predicted_labels_decision_tree)
optimised_precision_decision_tree = precision_score(y_true=y_test, y_pred=optimised_predicted_labels_decision_tree, average='weighted')
optimised_recall_decision_tree = recall_score(y_true=y_test, y_pred=optimised_predicted_labels_decision_tree, average='weighted')
optimised_f1_decision_tree = f1_score(y_true=y_test, y_pred=optimised_predicted_labels_decision_tree, average='weighted')

print("Accuracy:", optimised_accuracy_decision_tree)
print("Precision:", optimised_precision_decision_tree)
print("Recall:", optimised_recall_decision_tree)
print("F1 Score:", optimised_f1_decision_tree)

Accuracy: 0.30625
Precision: 0.3025365088816792
Recall: 0.30625
F1 Score: 0.2935570628931272


In [None]:
! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "Sample submission

/bin/bash: -c: line 1: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 2: syntax error: unexpected end of file
