**LOGISTIC REGRESSION**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# ---------------------------------------------------------------------------
# Data: labelled sentences. 'sentence' is the model input, 'difficulty' the
# class label to predict.
# ---------------------------------------------------------------------------
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')
X, y = df['sentence'], df['difficulty']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Model: unigram + bigram TF-IDF features feeding a logistic regression.
# ---------------------------------------------------------------------------
text_transformer = TfidfVectorizer(ngram_range=(1, 2))
model = LogisticRegression(max_iter=1000)
pipeline = make_pipeline(text_transformer, model)
pipeline.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Validation: score the held-out rows. Precision, recall and F1 are averaged
# over the classes with average='weighted'.
# ---------------------------------------------------------------------------
y_pred = pipeline.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {val_accuracy:.3f}")
val_precision = precision_score(y_val, y_pred, average='weighted')
print(f"Precision: {val_precision:.3f}")
val_recall = recall_score(y_val, y_pred, average='weighted')
print(f"Recall: {val_recall:.3f}")
val_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"F1-Score: {val_f1:.3f}")

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# ---------------------------------------------------------------------------
# Submission: predict the unlabelled test sentences and write them out as
# (id, difficulty) rows.
# ---------------------------------------------------------------------------
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')
predicted_difficulties = pipeline.predict(to_predict['sentence'])

submission = pd.DataFrame({'id': to_predict['id'], 'difficulty': predicted_difficulties})
submission.to_csv('submission_LR.csv', index=False)

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.6413
1000,1.2081
1500,1.1084
2000,1.0301
2500,0.8699
3000,0.7338
3500,0.6723
4000,0.6778
4500,0.5257
5000,0.3625


**DECISION TREE**

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the labelled training data: 'sentence' is the input text and
# 'difficulty' is the class label to predict.
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')

# Split the DataFrame into features (X) and target (y)
X = df['sentence']
y = df['difficulty']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a TF-IDF Vectorizer (unigrams + bigrams) and Decision Tree model.
# The tree is seeded because scikit-learn randomly permutes the candidate
# features at every split; without random_state the fitted tree, the metrics
# printed below and the submission file can all differ from run to run.
# Other parameters such as max_depth or min_samples_split are left at their defaults.
text_transformer = TfidfVectorizer(ngram_range=(1, 2))
model = DecisionTreeClassifier(random_state=42)

# Create a pipeline so the vectorizer is fitted on the training rows only
pipeline = make_pipeline(text_transformer, model)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = pipeline.predict(X_val)

# Calculate metrics (precision/recall/F1 use average='weighted')
print(f"Accuracy: {accuracy_score(y_val, y_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_pred, average='weighted'):.3f}")
print(f"Recall: {recall_score(y_val, y_pred, average='weighted'):.3f}")
print(f"F1-Score: {f1_score(y_val, y_pred, average='weighted'):.3f}")

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Load the unlabelled test data
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')

# Predict using the trained model
predicted_difficulties = pipeline.predict(to_predict['sentence'])

# Create a new DataFrame for submission: one (id, difficulty) row per test sentence
submission = pd.DataFrame({
    'id': to_predict['id'],
    'difficulty': predicted_difficulties
})

# Save the submission DataFrame to a new CSV file
submission.to_csv('submission_DT.csv', index=False)


Accuracy: 0.285
Precision: 0.277
Recall: 0.285
F1-Score: 0.274
Confusion Matrix:
[[102  29  18   6   5   6]
 [ 55  38  36  13   6  10]
 [ 47  38  41  14  12  14]
 [ 13  17  40  33  26  24]
 [ 16  20  34  23  31  28]
 [ 18  18  40  28  32  29]]


**kNN**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ---------------------------------------------------------------------------
# Data: labelled sentences. 'sentence' is the model input, 'difficulty' the
# class label to predict.
# ---------------------------------------------------------------------------
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')
X, y = df['sentence'], df['difficulty']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Model: unigram + bigram TF-IDF features feeding a k-nearest-neighbours
# classifier that votes over the 5 closest training sentences.
# ---------------------------------------------------------------------------
text_transformer = TfidfVectorizer(ngram_range=(1, 2))
model = KNeighborsClassifier(n_neighbors=5)
pipeline = make_pipeline(text_transformer, model)
pipeline.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Validation: score the held-out rows. Precision, recall and F1 are averaged
# over the classes with average='weighted'.
# ---------------------------------------------------------------------------
y_pred = pipeline.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {val_accuracy:.3f}")
val_precision = precision_score(y_val, y_pred, average='weighted')
print(f"Precision: {val_precision:.3f}")
val_recall = recall_score(y_val, y_pred, average='weighted')
print(f"Recall: {val_recall:.3f}")
val_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"F1-Score: {val_f1:.3f}")

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# ---------------------------------------------------------------------------
# Submission: predict the unlabelled test sentences and write them out as
# (id, difficulty) rows.
# ---------------------------------------------------------------------------
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')
predicted_difficulties = pipeline.predict(to_predict['sentence'])

submission = pd.DataFrame({'id': to_predict['id'], 'difficulty': predicted_difficulties})
submission.to_csv('submission_kNN.csv', index=False)


**RANDOM FOREST**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Load the labelled training data: 'sentence' is the input text and
# 'difficulty' is the class label to predict.
# The file lives under Data/ in the repository, the same location the other
# model cells in this notebook read from.
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')

# Split the DataFrame into features (X) and target (y)
X = df['sentence']
y = df['difficulty']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a TF-IDF Vectorizer (unigrams + bigrams)
text_transformer = TfidfVectorizer(ngram_range=(1, 2))

# Using Random Forest Classifier. It is seeded because the forest bootstraps
# rows and samples features at random; without random_state the selected
# hyperparameters, the metrics and the submission can differ from run to run.
model = RandomForestClassifier(random_state=42)

# Create a pipeline so the vectorizer is re-fitted inside every CV fold
pipeline = make_pipeline(text_transformer, model)

# GridSearchCV settings for hyperparameter tuning.
# Keys use the '<step name>__<parameter>' form; make_pipeline names the step
# after the lowercased estimator class.
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20]
}

# 5-fold cross-validated search over the grid, ranked by weighted precision
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='precision_weighted')

# Fit the grid search to the training split only; the validation rows stay unseen
grid_search.fit(X_train, y_train)

# Predictions on the validation set (GridSearchCV predicts with the best
# configuration, which it refits on all of X_train by default)
y_pred = grid_search.predict(X_val)

# Calculate metrics (precision/recall/F1 use average='weighted')
print(f"Accuracy: {accuracy_score(y_val, y_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_pred, average='weighted'):.3f}")
print(f"Recall: {recall_score(y_val, y_pred, average='weighted'):.3f}")
print(f"F1-Score: {f1_score(y_val, y_pred, average='weighted'):.3f}")

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))


# Load the unlabelled test data (also under Data/)
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')

# Predict using the tuned model
predicted_difficulties = grid_search.predict(to_predict['sentence'])

# Create a new DataFrame for submission: one (id, difficulty) row per test sentence
submission2 = pd.DataFrame({
    'id': to_predict['id'],
    'difficulty': predicted_difficulties
})

# Save the submission DataFrame to a new CSV file
submission2.to_csv('submission_RF.csv', index=False)
