**LOGISTIC REGRESSION**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# ---- Data ------------------------------------------------------------------
# Labelled training set: one sentence per row with its difficulty label.
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')

# Features are the raw sentences, the target is the difficulty label.
X, y = df['sentence'], df['difficulty']

# Keep 20% of the rows aside for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---- Model -----------------------------------------------------------------
# Unigram + bigram TF-IDF features feeding a logistic regression classifier.
text_transformer = TfidfVectorizer(ngram_range=(1, 2))
model = LogisticRegression(max_iter=1000)
pipeline = make_pipeline(text_transformer, model)
pipeline.fit(X_train, y_train)

# ---- Validation ------------------------------------------------------------
y_pred = pipeline.predict(X_val)

# Compute every score first, then report them together.
# Precision / recall / F1 are averaged over classes, weighted by support.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')
f1 = f1_score(y_val, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# ---- Submission ------------------------------------------------------------
# Unlabelled sentences to score with the fitted pipeline.
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')
predicted_difficulties = pipeline.predict(to_predict['sentence'])

# Two-column frame: the original row id and the predicted difficulty.
submission = pd.DataFrame({
    'id': to_predict['id'],
    'difficulty': predicted_difficulties,
})

# Written without the index so the CSV contains only the two columns above.
submission.to_csv('submission_LR.csv', index=False)

Accuracy: 0.435
Precision: 0.435
Recall: 0.435
F1-Score: 0.424
Confusion Matrix:
[[101  34  15   7   3   6]
 [ 45  51  35   6   9  12]
 [ 30  47  51   3   7  28]
 [  8   5  11  45  36  48]
 [  5   3   8  18  56  62]
 [  5   3   6  14  23 114]]


**DECISION TREE**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the labelled training set (one sentence per row with its difficulty label)
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')

# Split the DataFrame into features (X) and target (y)
X = df['sentence']
y = df['difficulty']

# Split the data into training and validation sets (80/20, seeded)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a TF-IDF Vectorizer (unigrams + bigrams) and Decision Tree model.
# random_state is fixed because scikit-learn shuffles the candidate features at
# every split; without a seed the fitted tree, the metrics below and the
# submission file change from one run to the next.
# Other parameters such as max_depth or min_samples_split can be tuned here.
text_transformer = TfidfVectorizer(ngram_range=(1, 2))
model = DecisionTreeClassifier(random_state=42)

# Create a pipeline so the vectorizer is fitted on the training split only
pipeline = make_pipeline(text_transformer, model)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = pipeline.predict(X_val)

# Calculate metrics (precision / recall / F1 are support-weighted class averages)
print(f"Accuracy: {accuracy_score(y_val, y_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_pred, average='weighted'):.3f}")
print(f"Recall: {recall_score(y_val, y_pred, average='weighted'):.3f}")
print(f"F1-Score: {f1_score(y_val, y_pred, average='weighted'):.3f}")

# Confusion matrix (rows: true labels, columns: predicted labels)
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Load the unlabelled test data
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')

# Predict using the trained model
predicted_difficulties = pipeline.predict(to_predict['sentence'])

# Create a new DataFrame for submission: row id plus predicted difficulty
submission = pd.DataFrame({
    'id': to_predict['id'],
    'difficulty': predicted_difficulties
})

# Save the submission DataFrame to a new CSV file (index column omitted)
submission.to_csv('submission_DT.csv', index=False)


Accuracy: 0.274
Precision: 0.270
Recall: 0.274
F1-Score: 0.265
Confusion Matrix:
[[94 38 14  9  5  6]
 [57 36 36 15  8  6]
 [48 39 36 18 11 14]
 [18 23 32 35 28 17]
 [17 26 27 23 33 26]
 [18 16 38 31 33 29]]


**kNN**

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ---- Data ------------------------------------------------------------------
# Labelled training set: sentences and their difficulty labels.
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')

# Sentences are the model input, difficulty is what we predict.
X, y = df['sentence'], df['difficulty']

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---- Model -----------------------------------------------------------------
# Unigram + bigram TF-IDF features, classified by the 5 nearest neighbours.
text_transformer = TfidfVectorizer(ngram_range=(1, 2))
model = KNeighborsClassifier(n_neighbors=5)
pipeline = make_pipeline(text_transformer, model)
pipeline.fit(X_train, y_train)

# ---- Validation ------------------------------------------------------------
y_pred = pipeline.predict(X_val)

# Gather the scores in one place; the last three are support-weighted
# averages over the classes.
scores = {
    'accuracy': accuracy_score(y_val, y_pred),
    'precision': precision_score(y_val, y_pred, average='weighted'),
    'recall': recall_score(y_val, y_pred, average='weighted'),
    'f1': f1_score(y_val, y_pred, average='weighted'),
}

print(f"Accuracy: {scores['accuracy']:.3f}")
print(f"Precision: {scores['precision']:.3f}")
print(f"Recall: {scores['recall']:.3f}")
print(f"F1-Score: {scores['f1']:.3f}")

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# ---- Submission ------------------------------------------------------------
# Score the unlabelled sentences with the fitted pipeline.
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')
predicted_difficulties = pipeline.predict(to_predict['sentence'])

# Pair every test row id with its predicted difficulty.
submission = pd.DataFrame({
    'id': to_predict['id'],
    'difficulty': predicted_difficulties,
})

# Index is dropped so the CSV holds exactly the two columns above.
submission.to_csv('submission_kNN.csv', index=False)


Accuracy: 0.333
Precision: 0.371
Recall: 0.333
F1-Score: 0.311
Confusion Matrix:
[[135  20   7   1   1   2]
 [ 88  45  13   2   5   5]
 [ 86  38  30   3   4   5]
 [ 44  32  14  40  13  10]
 [ 37  26  16  26  32  15]
 [ 37  28  23  27  12  38]]


**RANDOM FOREST**

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Load the labelled training set (one sentence per row with its difficulty label)
df = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/training_data.csv')

# Split the DataFrame into features (X) and target (y)
X = df['sentence']
y = df['difficulty']

# Split the data into training and validation sets (80/20, seeded)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a TF-IDF Vectorizer (unigrams + bigrams)
text_transformer = TfidfVectorizer(ngram_range=(1, 2))

# Using Random Forest Classifier.
# random_state is fixed because the forest draws bootstrap samples and random
# feature subsets; unseeded, the grid-search winner, the metrics below and the
# submission file would differ on every run.
model = RandomForestClassifier(random_state=42)

# Create a pipeline; make_pipeline names the steps after the lowercased class
# names, which is where the 'randomforestclassifier__' prefix below comes from.
pipeline = make_pipeline(text_transformer, model)

# GridSearchCV settings for hyperparameter tuning (2 x 3 = 6 candidates)
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20]
}

# 5-fold cross-validated search; candidates are ranked by weighted precision
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='precision_weighted')

# Fit the grid search on the training split only, so the validation rows stay unseen
grid_search.fit(X_train, y_train)

# Predictions on the validation set; with GridSearchCV's default refit
# behaviour, predict() uses the best candidate refitted on the full training split.
y_pred = grid_search.predict(X_val)

# Calculate metrics (precision / recall / F1 are support-weighted class averages)
print(f"Accuracy: {accuracy_score(y_val, y_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_pred, average='weighted'):.3f}")
print(f"Recall: {recall_score(y_val, y_pred, average='weighted'):.3f}")
print(f"F1-Score: {f1_score(y_val, y_pred, average='weighted'):.3f}")

# Confusion matrix (rows: true labels, columns: predicted labels)
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))


# Load the unlabelled test data
to_predict = pd.read_csv('https://raw.githubusercontent.com/AdamMonroUnil/DSML/main/Data/unlabelled_test_data.csv')

# Predict using the tuned model
predicted_difficulties = grid_search.predict(to_predict['sentence'])

# Create a new DataFrame for submission: row id plus predicted difficulty
submission2 = pd.DataFrame({
    'id': to_predict['id'],
    'difficulty': predicted_difficulties
})

# Save the submission DataFrame to a new CSV file (index column omitted)
submission2.to_csv('submission_RF.csv', index=False)


Accuracy: 0.373
Precision: 0.404
Recall: 0.373
F1-Score: 0.352
Confusion Matrix:
[[145  13   4   3   1   0]
 [ 97  40  14   2   2   3]
 [ 80  35  39   5   6   1]
 [ 34  21  18  45  24  11]
 [ 33  14  16  36  39  14]
 [ 34  13  18  21  29  50]]
