# Import Packages

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# ML pipeline Functions

## Feature Extraction Function TF-IDF / BoW

In [None]:
def feature_extraction(data, vectorizer=None):
    """Vectorize the ``tweet`` column and return it with the ``topic`` labels.

    Args:
        data: Column-indexable container (e.g. a pandas DataFrame) providing
            ``'tweet'`` (the documents) and ``'topic'`` (the labels).
        vectorizer: Object exposing ``fit_transform``, such as
            ``TfidfVectorizer()`` or ``CountVectorizer()``. When omitted, a
            new ``TfidfVectorizer`` is created for this call.

    Returns:
        Tuple ``(X, y, vectorizer)``: the transformed documents, the labels,
        and the vectorizer that was fitted on them.
    """
    if vectorizer is None:
        # Build the default per call: an instance placed in the signature is
        # created once at definition time, so every call relying on it would
        # re-fit (and silently overwrite) the same shared object.
        vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data['tweet'])
    y = data['topic']
    return X, y, vectorizer

## Split Data Into Training, Validation, and Test Sets

In [None]:
# Split data into training, validation, and test sets
def split_data(X, y, test_size=0.2, val_size=0.1, random_state=42, stratify=False):
    """Split features and labels into train, validation, and test subsets.

    The split happens in two stages: the test set is carved out first, then
    the validation set is taken from what remains.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.
        test_size: Share of the full data held out for testing.
        val_size: Share of the *remaining* (non-test) data used for
            validation. With the defaults this is 0.1 * 0.8 = 8% of all rows.
        random_state: Seed passed to both splits for reproducibility.
        stratify: When True, both splits preserve the class proportions of
            ``y``. Defaults to False, which keeps the plain random split.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=val_size,
        random_state=random_state,
        stratify=y_train_full if stratify else None,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

## Model Training And Evaluation

In [None]:
# Model Training and Evaluation
def train_evaluate_model(X_train, y_train, X_val, y_val,algorithm):
    """Fit ``algorithm`` on the training split and score it on the validation split.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        algorithm: Estimator exposing ``fit`` and ``predict``; it is fitted
            in place and returned.

    Returns:
        Tuple ``(classifier, accuracy, report)``: the fitted estimator, its
        validation accuracy, and the validation classification report.
    """
    algorithm.fit(X_train, y_train)
    val_predictions = algorithm.predict(X_val)
    return (
        algorithm,
        accuracy_score(y_val, val_predictions),
        classification_report(y_val, val_predictions),
    )

## Hyperparameter Tuning

In [None]:
def tune_hyperparameters_random(X_train, y_train, n_iter=50, random_state=42,param_dist=None,classifier=None):
    """Tune ``classifier`` with a 5-fold randomized hyperparameter search.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_iter: Number of parameter settings sampled by the search.
        random_state: Seed for the parameter sampling.
        param_dist: Parameter space handed to ``RandomizedSearchCV`` as
            ``param_distributions``. Required despite the ``None`` default.
        classifier: Unfitted estimator to tune. Required despite the ``None``
            default.

    Returns:
        Tuple ``(best_estimator, best_params)`` from the fitted search.

    Raises:
        ValueError: If ``classifier`` or ``param_dist`` is not provided.
    """
    # Fail early with a clear message rather than deep inside scikit-learn.
    if classifier is None:
        raise ValueError("classifier must be an estimator instance, got None")
    if param_dist is None:
        raise ValueError("param_dist must describe the search space, got None")

    random_search = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=5,
        random_state=random_state,
    )
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_

## Model Evaluation

In [None]:
def evaluate_model(y_true, y_pred,model):
    """Summarise prediction quality for one model as a flat dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model: Value stored under ``'modelName'`` to identify the model.

    Returns:
        Dict with keys ``modelName``, ``accuracy``, ``precision``, ``recall``
        and ``f1_score``; the last three use ``average='weighted'``.
    """
    metrics = {'modelName': model, 'accuracy': accuracy_score(y_true, y_pred)}
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    for key, scorer in weighted_scorers:
        metrics[key] = scorer(y_true, y_pred, average='weighted')
    return metrics

# Load, Feature Extraction, and Split Data

In [None]:
# Read the tweets CSV into a DataFrame (no preprocessing is done in this cell)
processed_data = pd.read_csv(filepath_or_buffer='/content/tweetsData.csv')

In [None]:
# Preview the first five rows of the loaded data
processed_data.head(n=5)

Unnamed: 0,tweet,lang,topic
0,تضخم سنو مغرب تباطأ خلال ما توجه اقتصاد عام,ar,Economy
1,طبيق مغرب اتفاق جار ريطانيا نفس امتياز اتفاق م...,ar,Economy
2,extensive exchange eib step eu eib work morocc...,en,Economy
3,صور سلا كهرباء ضح نه حدى دول مغرب عرب فساد سوء...,ar,Economy
4,التجارةالخارجية رقم قياس تاريخ تجار ثنائ إسبان...,ar,Economy


In [None]:
# Vectorize the tweets with TF-IDF (pass CountVectorizer() instead for bag-of-words).
# NOTE(review): the vectorizer is fitted on every row before the split below, so
# validation/test text contributes to the fitted vocabulary - consider fitting
# on the training split only.
X, y, vectorizer = feature_extraction(processed_data, vectorizer=TfidfVectorizer())
# Hold out validation and test subsets using split_data's default sizes and seed
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

# Models

 ## SVM

In [None]:
# Baseline SVM (RBF kernel, remaining settings at defaults) scored on the validation split
classifier, val_accuracy, val_report = train_evaluate_model(
    X_train,
    y_train,
    X_val,
    y_val,
    algorithm=SVC(kernel='rbf'),
)

In [None]:
# Show the baseline (untuned) SVM metrics measured on the validation split
print("Initial Validation Accuracy:", val_accuracy)
print("Initial Validation Classification Report:\n", val_report)

In [None]:
# Candidate SVM settings sampled by the randomized search
param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'] + list(np.logspace(-3, 2, 6)),
)

# Search on the training split; the best estimator is returned already fitted
best_classifier, best_params = tune_hyperparameters_random(
    X_train,
    y_train,
    param_dist=param_grid,
    classifier=SVC(),
)

# Score the tuned SVM on the validation split
y_val_pred = best_classifier.predict(X_val)
tuned_val_report = classification_report(y_true=y_val, y_pred=y_val_pred)
tuned_val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)

In [None]:
# Show the tuned SVM's validation metrics and the hyperparameters the search selected
print("Tuned Validation Accuracy:", tuned_val_accuracy)
print("Tuned Validation Classification Report:\n", tuned_val_report)
print("Best Hyperparameters:", best_params)

In [None]:
# Final check: score the tuned SVM on the held-out test split
y_test_pred = best_classifier.predict(X_test)
final_test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
final_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Show the tuned SVM's metrics on the held-out test split
print("Final Test Accuracy:", final_test_accuracy)
print("Final Test Classification Report:\n", final_test_report)

 ## RandomForestClassifier

In [None]:
# Baseline random forest (default hyperparameters) scored on the validation split.
# random_state is fixed so the baseline is reproducible between runs, matching
# the seeded estimator used for the tuned search in this section.
classifier, val_accuracy, val_report = train_evaluate_model(
    X_train,
    y_train,
    X_val,
    y_val,
    RandomForestClassifier(random_state=42),
)

In [None]:
# Show the baseline (untuned) random forest metrics measured on the validation split
print("Initial Validation Accuracy:", val_accuracy)
print("Initial Validation Classification Report:\n", val_report)

In [None]:
# Candidate random-forest settings sampled by the randomized search.
# 'auto' is intentionally not listed for max_features: for forest classifiers
# it was an alias of 'sqrt', and current scikit-learn releases reject it, so
# every candidate that sampled it would fail to fit.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Search on the training split; the best estimator is returned already fitted
best_classifier, best_params = tune_hyperparameters_random(X_train, y_train,param_dist=param_grid,classifier=RandomForestClassifier(random_state=42))
y_val_pred = best_classifier.predict(X_val)

# Score the tuned forest on the validation split
tuned_val_accuracy = accuracy_score(y_val, y_val_pred)
tuned_val_report = classification_report(y_val, y_val_pred)

In [None]:
# Show the tuned random forest's validation metrics and the selected hyperparameters
print("Tuned Validation Accuracy:", tuned_val_accuracy)
print("Tuned Validation Classification Report:\n", tuned_val_report)
print("Best Hyperparameters:", best_params)

In [None]:
# Final check: score the tuned random forest on the held-out test split
y_test_pred = best_classifier.predict(X_test)
final_test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
final_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Show the tuned random forest's metrics on the held-out test split
print("Final Test Accuracy:", final_test_accuracy)
print("Final Test Classification Report:\n", final_test_report)

 ## Decision Tree

In [None]:
# Baseline decision tree (default hyperparameters) scored on the validation split.
# The features are whatever the vectorization cell produced (TF-IDF as written there).
# random_state is fixed so the baseline is reproducible between runs, matching
# the seeded estimator used for the tuned search in this section.
classifier, val_accuracy, val_report = train_evaluate_model(
    X_train,
    y_train,
    X_val,
    y_val,
    DecisionTreeClassifier(random_state=42),
)

In [None]:
# Show the baseline (untuned) decision tree metrics measured on the validation split
print("Initial Validation Accuracy:", val_accuracy)
print("Initial Validation Classification Report:\n", val_report)

In [None]:
# Candidate decision-tree settings sampled by the randomized search.
# 'auto' is intentionally not listed for max_features: for tree classifiers it
# was an alias of 'sqrt', and current scikit-learn releases reject it, so every
# candidate that sampled it would fail to fit.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Search on the training split; the best estimator is returned already fitted
best_classifier, best_params = tune_hyperparameters_random(X_train, y_train,param_dist=param_grid,classifier=DecisionTreeClassifier(random_state=42))

y_val_pred = best_classifier.predict(X_val)

# Score the tuned tree on the validation split
tuned_val_accuracy = accuracy_score(y_val, y_val_pred)
tuned_val_report = classification_report(y_val, y_val_pred)

In [None]:
# Show the tuned decision tree's validation metrics and the selected hyperparameters
print("Tuned Validation Accuracy:", tuned_val_accuracy)
print("Tuned Validation Classification Report:\n", tuned_val_report)
print("Best Hyperparameters:", best_params)

In [None]:
# Final check: score the tuned decision tree on the held-out test split
y_test_pred = best_classifier.predict(X_test)
final_test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
final_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Show the tuned decision tree's metrics on the held-out test split
print("Final Test Accuracy:", final_test_accuracy)
print("Final Test Classification Report:\n", final_test_report)

 ## Logistic Regression

In [None]:
# Baseline logistic regression (iteration cap raised to 1000) scored on the validation split
classifier, val_accuracy, val_report = train_evaluate_model(
    X_train,
    y_train,
    X_val,
    y_val,
    algorithm=LogisticRegression(max_iter=1000),
)

In [None]:
# Show the baseline (untuned) logistic regression metrics measured on the validation split
print("Initial Validation Accuracy:", val_accuracy)
print("Initial Validation Classification Report:\n", val_report)

In [None]:
# One sub-grid per penalty so that every sampled candidate is a combination
# the solver supports. A single flat grid mixes incompatible penalty/solver
# pairs (e.g. 'l1' with 'lbfgs', 'elasticnet' with anything but 'saga'), whose
# fits fail and waste search iterations.
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 200, 300],
    },
    {
        'penalty': ['l1'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200, 300],
    },
    {
        # l1_ratio is only meaningful for the elastic-net penalty
        'penalty': ['elasticnet'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'max_iter': [100, 200, 300],
        'l1_ratio': np.linspace(0, 1, 10),
    },
    {
        # No regularization: spelled None (the string 'none' is rejected by
        # current scikit-learn releases); C has no effect here, so it is not searched.
        'penalty': [None],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'max_iter': [100, 200, 300],
    },
]

# Search on the training split; the best estimator is returned already fitted
best_classifier, best_params = tune_hyperparameters_random(X_train, y_train,param_dist=param_grid,classifier=LogisticRegression(random_state=42))
y_val_pred = best_classifier.predict(X_val)

# Score the tuned model on the validation split
tuned_val_accuracy = accuracy_score(y_val, y_val_pred)
tuned_val_report = classification_report(y_val, y_val_pred)

In [None]:
# Show the tuned logistic regression's validation metrics and the selected hyperparameters
print("Tuned Validation Accuracy:", tuned_val_accuracy)
print("Tuned Validation Classification Report:\n", tuned_val_report)
print("Best Hyperparameters:", best_params)

In [None]:
# Final check: score the tuned logistic regression on the held-out test split
y_test_pred = best_classifier.predict(X_test)
final_test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
final_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Show the tuned logistic regression's metrics on the held-out test split
print("Final Test Accuracy:", final_test_accuracy)
print("Final Test Classification Report:\n", final_test_report)

 ## ANN