In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from google.colab import files
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier


# Fetch the three competition CSV files (sample submission, training data and
# unlabelled test data) from the GitHub repository, in that order.
sample_url = "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data/sample_submission.csv"
train_url = "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data/training_data.csv"
test_url = "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data/unlabelled_test_data.csv"

# One read_csv call per URL; the unpacking order matches the tuple order.
sample, train_dataset, test_dataset = map(pd.read_csv, (sample_url, train_url, test_url))

In [None]:
# Pull the model input (X: the "sentence" column) and the label to predict
# (y: the "difficulty" column) out of the training frame in one step.
X, y = train_dataset["sentence"], train_dataset["difficulty"]

In [None]:
# Hold out part of the labelled data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the evaluation split
    random_state=42,  # fixed seed so the split is the same on every run
)

In [None]:
# Turn the raw sentences into TF-IDF feature matrices.
tfidf_vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training sentences only; the held-out
# sentences are merely transformed with the already-fitted vectorizer.
# (Tuple elements are evaluated left to right, so the fit happens first.)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [None]:
%%capture
# Support Vector Classifier: build, fit on the TF-IDF training matrix, predict
# on the held-out split, and compute the evaluation metrics.

# Linear kernel is used here; other SVC kernels (e.g. 'rbf') can be substituted.
svc_model = SVC(kernel='linear', random_state=42)
svc_model.fit(X_train_tfidf, y_train)

# Predicted labels for the held-out sentences
predictions_svc = svc_model.predict(X_test_tfidf)

# Accuracy plus weighted-average precision / recall / F1.
# The fourth value returned by precision_recall_fscore_support is discarded.
accuracy_svc = accuracy_score(y_test, predictions_svc)
precision_svc, recall_svc, f1_svc, _ = precision_recall_fscore_support(
    y_test,
    predictions_svc,
    average='weighted',
)

In [None]:
# Report the SVC evaluation metrics, one per line.
print(
    f"Accuracy (SVC): {accuracy_svc}",
    f"Precision: {precision_svc}",
    f"Recall: {recall_svc}",
    f"F1 Score: {f1_svc}",
    sep="\n",
)

Accuracy (SVC): 0.4552083333333333
Precision: 0.4508158506257017
Recall: 0.4552083333333333
F1 Score: 0.4509219661114091


In [None]:
# Produce the submission file for the unlabelled test set.

# Sentences to classify, vectorized with the TF-IDF vectorizer fitted earlier
# (transform only, so the vectorizer itself is not refitted here).
X_test_final = test_dataset["sentence"]
X_test_final_tfidf = tfidf_vectorizer.transform(X_test_final)

# Difficulty labels predicted by the already-trained SVC model
predictions_test_final = svc_model.predict(X_test_final_tfidf)

# Pair each test-set id with its predicted difficulty
submission_columns = {
    "id": test_dataset["id"],
    "difficulty": predictions_test_final,
}
result_df = pd.DataFrame(submission_columns)

# Write the predictions to disk without the DataFrame index column
result_df.to_csv("predictions_test.csv", index=False)

In [None]:
# Hand the saved predictions CSV to google.colab's `files.download` helper
# so it can be retrieved from the Colab runtime.
submission_filename = "predictions_test.csv"
files.download(submission_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>