### 1. Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, f1_score

### 2. Load Preprocessed Dataset

In [6]:
# Path to the cleaned and encoded dataset
input_path = r"/Users/dimzografos/Desktop/Assignments/MLPC/preprocessed_airline_sentiment.csv"
data = pd.read_csv(input_path)

# Preview the first 5 rows
data.head(5)

Unnamed: 0,airline_sentiment,text
0,1,said
1,2,plus youve added commercial experience tacky
2,1,didnt today must mean need take another trip
3,0,really aggressive blast obnoxious entertainmen...
4,0,really big bad thing


In [8]:
# Remove every row that is missing either the text or its sentiment label
required_columns = ['text', 'airline_sentiment']
data.dropna(subset=required_columns, inplace=True)

In [10]:
# Cast the text column to str so the vectorizer only ever receives strings
data = data.astype({'text': str})

In [12]:
# Split the frame into the model input (text) and the target (sentiment label)
X, y = data['text'], data['airline_sentiment']

### 3. Define Classifiers and Hyperparameter Grids

In [15]:
# Define classifiers
# Fixed seed for every estimator that draws random numbers (forest bootstrap /
# feature sampling, MLP weight init and shuffling, LinearSVC's dual solver).
# Without it, scores and the selected hyperparameters change from run to run.
MODEL_SEED = 42

classifiers = {
    # lbfgs (the only solver in the grid) is deterministic, so no seed needed
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': LinearSVC(max_iter=1000, dual='auto', random_state=MODEL_SEED),
    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'Neural Network': MLPClassifier(max_iter=300, random_state=MODEL_SEED)
}

In [17]:
# Hyperparameter search space per classifier. The outer keys match the names
# in `classifiers`; the `classifier__` prefix routes each parameter to the
# pipeline step named 'classifier'.
param_grids = {
    'Logistic Regression': dict(
        classifier__C=[0.1, 1, 10],
        classifier__solver=['lbfgs'],
    ),
    'SVM': dict(
        classifier__C=[0.1, 1, 10],
    ),
    'Random Forest': dict(
        classifier__n_estimators=[100, 300],
        classifier__max_depth=[None, 10, 30],
    ),
    'Neural Network': dict(
        classifier__hidden_layer_sizes=[(100,), (50, 50)],
        classifier__activation=['relu', 'tanh'],
    ),
}

### 4. Define TF-IDF Vectorization Settings

In [20]:
# (constructor argument, value) pairs for the vectorizer variants to compare
tfidf_settings = [
    ('min_df', 5),
    ('max_features', 2500),
    ('max_features', 500),
]

# Label each vectorizer "<argument>=<value>" so results can be grouped by setting
tfidf = {
    f"{option}={value}": TfidfVectorizer(**{option: value})
    for option, value in tfidf_settings
}

### 5. Hyperparameter Tuning with GridSearchCV

In [23]:
# Shared evaluation setup: shuffled 5-fold splitter with a fixed seed, and a
# weighted-average F1 scorer
N_SPLITS, CV_SEED = 5, 42
kf = KFold(n_splits=N_SPLITS, random_state=CV_SEED, shuffle=True)
scorer = make_scorer(f1_score, average='weighted')

In [25]:
# Initialize a list to store tuning results (one dict per TF-IDF/model pair)
all_results = []

# Evaluate every grid candidate on both metrics during the search itself, so
# the winning configuration's accuracy and F1 can be read from cv_results_
# instead of running a second and third round of cross-validation on the
# same folds.
scoring = {'accuracy': 'accuracy', 'f1_weighted': scorer}

# Loop through each TF-IDF configuration
for tfidf_name, tfidf_vectorizer in tfidf.items():
    print("\n============================")
    print(f" TF-IDF Setting: {tfidf_name}")
    print("============================")

    # Loop through each classifier and its parameter grid
    for clf_name, classifier in classifiers.items():
        print(f"Tuning {clf_name}...")

        # Create pipeline: TF-IDF vectorizer + classifier
        pipeline = Pipeline([
            ('tfidf', tfidf_vectorizer),
            ('classifier', classifier)
        ])

        # Run Grid Search with 5-fold cross-validation.
        # With multiple metrics, `refit` must name the one used for model
        # selection; it is what makes best_index_/best_score_/best_params_
        # available. Selection stays on weighted F1.
        grid = GridSearchCV(
            pipeline,
            param_grids[clf_name],
            cv=kf,
            scoring=scoring,
            refit='f1_weighted',
            n_jobs=-1,
        )
        grid.fit(X, y)

        # Mean cross-validated scores of the selected candidate.
        # NOTE: these are the same folds used to pick the hyperparameters, so
        # the numbers are optimistic; use nested CV or a held-out test set
        # for an unbiased estimate.
        acc = grid.cv_results_['mean_test_accuracy'][grid.best_index_]
        f1 = grid.best_score_

        # Store results in a list of dictionaries
        all_results.append({
            'TF-IDF Setting': tfidf_name,
            'Model': clf_name,
            'Accuracy': round(acc, 4),
            'F1-score': round(f1, 4),
            'Best Parameters': grid.best_params_
        })



 TF-IDF Setting: min_df=5
Tuning Logistic Regression...
Tuning SVM...
Tuning Random Forest...
Tuning Neural Network...

 TF-IDF Setting: max_features=2500
Tuning Logistic Regression...
Tuning SVM...
Tuning Random Forest...
Tuning Neural Network...

 TF-IDF Setting: max_features=500
Tuning Logistic Regression...
Tuning SVM...
Tuning Random Forest...
Tuning Neural Network...




### 6. Export Tuned Results 

In [32]:
# Destination CSV for the tuning summary
output_path = r"/Users/dimzografos/Desktop/Assignments/MLPC/sentiment_analysis_with_tuning.csv"

# Build the results table from the dictionaries collected in all_results
results_df = pd.DataFrame(all_results)

# Write the table without the index column, then report where it went
results_df.to_csv(output_path, index=False)
print("\nTuning results saved to: " + output_path)


Tuning results saved to: /Users/dimzografos/Desktop/Assignments/MLPC/sentiment_analysis_with_tuning.csv
