In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from google.colab import files
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier


# Loading the sample, training, and test datasets.
# All three CSV files sit in the same remote folder, so the shared URL prefix is
# kept in a single constant: moving the data means editing one line only.
DATA_URL = "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data"

# presumably an example of the expected submission layout (inferred from the
# file name) — it is not read again in the visible cells
sample = pd.read_csv(f"{DATA_URL}/sample_submission.csv")
# labelled rows: the "sentence" and "difficulty" columns are used for training
train_dataset = pd.read_csv(f"{DATA_URL}/training_data.csv")
# unlabelled rows: the "id" and "sentence" columns are used for the final predictions
test_dataset = pd.read_csv(f"{DATA_URL}/unlabelled_test_data.csv")

In [2]:
# Separate the labelled data into the model input (the sentence text) and the
# target to predict (the difficulty label).
X, y = train_dataset["sentence"], train_dataset["difficulty"]

In [3]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Using TF-IDF to represent the text.
# The vectorizer is fitted on the training split only (`fit_transform`); the
# held-out split is then mapped with that same fitted vectorizer (`transform`),
# so the evaluation data does not influence the learned features.
# Order matters: `transform` must run after the fit.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [5]:
%%capture
# Random Forest baseline: fit on the TF-IDF training features, then score the
# model on the held-out split.

# Build and train the forest in one step (`fit` returns the fitted estimator).
# The fixed seed makes the fitted forest reproducible.
rf_model = RandomForestClassifier(
    n_estimators=100,  # number of trees; adjust as needed
    random_state=42,
).fit(X_train_tfidf, y_train)

# Predicted difficulty labels for the held-out sentences
predictions_rf = rf_model.predict(X_test_tfidf)

# Score the predictions. Precision, recall and F1 use the 'weighted' average;
# the fourth returned item is not needed and is discarded.
accuracy_rf = accuracy_score(y_test, predictions_rf)
precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(
    y_test,
    predictions_rf,
    average="weighted",
)

In [6]:
# Report the hold-out metrics, one per line
print(
    f"Accuracy (Random Forest): {accuracy_rf}",
    f"Precision: {precision_rf}",
    f"Recall: {recall_rf}",
    f"F1 Score: {f1_rf}",
    sep="\n",
)

Accuracy (Random Forest): 0.39166666666666666
Precision: 0.39660960624031383
Recall: 0.39166666666666666
F1 Score: 0.37769086230428955


In [7]:
# Produce predictions for the unlabelled test sentences and store them on disk.

# Vectorise the sentences with the already-fitted TF-IDF vectorizer
# (transform only, no refit)
X_test_final = test_dataset["sentence"]
X_test_final_tfidf = tfidf_vectorizer.transform(X_test_final)

# Predict a difficulty label for every unlabelled sentence
predictions_test_final = rf_model.predict(X_test_final_tfidf)

# Pair each prediction with its row id and write the table as CSV
# (without the DataFrame index column)
result_df = pd.DataFrame(
    {
        "id": test_dataset["id"],
        "difficulty": predictions_test_final,
    }
)
result_df.to_csv("predictions_test.csv", index=False)

In [8]:
# Downloading the CSV file containing the final predictions.
# `files` is the helper imported from `google.colab`, so this cell is expected to
# work only when the notebook runs inside Google Colab — NOTE(review): confirm.
# The file name must match the one passed to `result_df.to_csv(...)`.
files.download("predictions_test.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>