## 2. Experiments

You will vectorize the text (located in the column `text`) by using the well-known TF-IDF technique. There will be three cases where the vocabulary of `TfidfVectorizer` will be limited to:

1. Contain words that appear in at least 5 documents (hint: `min_df` parameter of `TfidfVectorizer`).
2. Contain 2500 words (hint: `max_features` parameter of `TfidfVectorizer`).
3. Contain 500 words (hint: `max_features` parameter of `TfidfVectorizer`).

The classifiers will be evaluated by using 5-fold cross validation. Make sure that no information will be leaked from the training set to the test set. The values of the following three metrics will be measured:

* $M_1$: Accuracy
* $M_2$: F1-score
* $M_3$: Fit time


## Solution

Please write your solution here, including your code and descriptions. **Do not modify the notebook's structure**.


### 1. Import Libraries

In [None]:
# Import all the necessary libraries for the project
import os
import pandas as pd
import numpy as np
import time
import re
import nltk
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import make_scorer, accuracy_score, f1_score

### 2. Load Dataset

In [None]:
# Read the airline-sentiment tweets CSV into a DataFrame
data = pd.read_csv(
    filepath_or_buffer=r"/Users/dimzografos/Desktop/Assignments/MLPC/Twitter_US_Airline_Sentiment.csv"
)

# Report (rows, columns) of the freshly loaded frame
print("Dataframe shape:", data.shape)

# Preview the first 10 rows
data.head(n=10)

In [None]:
# Per-column count of missing values
print("Missing values:\n", data.isna().sum())

In [None]:
# Keep only the tweet text and its sentiment label; every other column is dropped in place
data.drop(
    columns=data.columns.difference(["text", "airline_sentiment"], sort=False),
    inplace=True,
)
data.info()

### 3. Text Preprocessing

In [None]:
# Download required NLTK data (run once if not already downloaded)
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) make word_tokenize look up 'punkt_tab'
# instead of the pickled 'punkt' models; fetching both keeps the notebook
# runnable on old and new installations.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Define preprocessing function

# Cache for the NLTK resources shared by every preprocessor() call. It is
# filled on first use so that defining this cell never touches the NLTK data.
_PREPROCESSOR_RESOURCES = {}


def _preprocessor_resources():
    """Return the (stop-word set, lemmatizer) pair, building it once on first use."""
    if 'pair' not in _PREPROCESSOR_RESOURCES:
        _PREPROCESSOR_RESOURCES['pair'] = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _PREPROCESSOR_RESOURCES['pair']


def preprocessor(text):
    """Normalise one raw tweet into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and #hashtags,
    drop every character that is not a-z or whitespace, tokenise, remove
    English stop words, lemmatise.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string input (e.g. None or NaN for a missing
        tweet) is treated as an empty document instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers

    # The stop-word list and the lemmatizer do not depend on the tweet, so they
    # are built once and reused rather than re-created for every row.
    stop_words, lemmatizer = _preprocessor_resources()

    tokens = word_tokenize(text)  # Tokenize
    return ' '.join(
        lemmatizer.lemmatize(word)  # Lemmatize
        for word in tokens
        if word not in stop_words  # Remove stopwords
    )

In [None]:
# Replace every raw tweet with its cleaned form, element by element
data['text'] = data['text'].map(preprocessor)

In [None]:
# Turn the sentiment labels into integer codes; the fitted encoder is kept in
# `le` so the codes can be mapped back to label names later
le = LabelEncoder().fit(data['airline_sentiment'])
data['airline_sentiment'] = le.transform(data['airline_sentiment'])

data.loc[:, ['text', 'airline_sentiment']].head()

In [None]:
# Features are the cleaned tweets, the target is the encoded sentiment
X, y = data['text'], data['airline_sentiment']

### 4. Define Classifiers

In [None]:
# Define the classifiers
# All four models are seeded with random_state=42 so repeated runs are comparable.
classifiers = {
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5 and slated for removal, and with the lbfgs solver a
    # problem with three or more classes is fitted as multinomial by default.
    'Logistic Regression': LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42),
    'SVM': LinearSVC(multi_class='ovr', max_iter=1000, random_state=42, dual=True),
    'Random Forest': RandomForestClassifier(n_estimators=1000, random_state=42),
    'Feed-forward Neural Network': MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=300, random_state=42)
}

### 5. Define TF-IDF Vectorization Settings

In [None]:
# One vectorizer per vocabulary restriction required by the task:
# document-frequency floor of 5, top 2500 terms, top 500 terms
tfidf = {
    label: TfidfVectorizer(**vocabulary_limit)
    for label, vocabulary_limit in [
        ('min_df=5', {'min_df': 5}),
        ('max_features=2500', {'max_features': 2500}),
        ('max_features=500', {'max_features': 500}),
    ]
}

### 6. Create Pipeline and Evaluation Function

In [None]:
# Shuffled 5-fold splitter with a fixed seed, shared by every experiment
kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [None]:
# Pipeline factory: vectorizer first, classifier second
def create_pipeline(classifier, tfidf_vectorizer):
    """Chain a TF-IDF vectorizer and a classifier into one estimator.

    The steps are named 'tfidf' and 'classifier'. Because the vectorizer is a
    pipeline step, it is fitted together with the classifier on whatever data
    the pipeline is fitted on.
    """
    steps = [
        ('tfidf', tfidf_vectorizer),
        ('classifier', classifier),
    ]
    return Pipeline(steps)

In [None]:
# Cross-validate one pipeline and average its per-fold metrics
def evaluate_pipeline(pipeline, X, y, kf, scoring):
    """Run `cross_validate` on `pipeline` and return the mean of each metric.

    Parameters
    ----------
    pipeline : estimator passed to `cross_validate`.
    X, y : features and target.
    kf : cross-validation splitter, forwarded as `cv`.
    scoring : scorer mapping forwarded as `scoring`; the result keys read below
        ('test_accuracy', 'test_f1_weighted') require it to define scorers
        named 'accuracy' and 'f1_weighted'.

    Returns
    -------
    dict
        Keys 'Accuracy', 'F1-score' and 'Fit Time (s)', each the mean over folds.
    """
    cv_results = cross_validate(pipeline, X, y, scoring=scoring, cv=kf, return_train_score=False)

    reported = (
        ('Accuracy', 'test_accuracy'),
        ('F1-score', 'test_f1_weighted'),
        ('Fit Time (s)', 'fit_time'),
    )
    return {label: np.mean(cv_results[key]) for label, key in reported}

### 7. Evaluate all Classifiers and TF-IDF Settings

In [None]:
# Evaluate every classifier under every TF-IDF setting
def evaluate_all_classifiers(X, y, tfidf, classifiers, kf):
    """Cross-validate each (TF-IDF setting, classifier) pair and print a report.

    Parameters
    ----------
    X, y : features and target handed to `evaluate_pipeline`.
    tfidf : dict mapping a setting label to a vectorizer.
    classifiers : dict mapping a classifier label to an estimator.
    kf : cross-validation splitter.

    Returns
    -------
    dict
        ``{setting_label: {classifier_label: metrics}}`` where `metrics` is the
        dict returned by `evaluate_pipeline`.
    """
    scoring = dict(accuracy='accuracy', f1_weighted='f1_weighted')
    all_results = {}

    for setting_name, tfidf_vectorizer in tfidf.items():
        print(f"\nTF-IDF: {setting_name} \n")

        per_classifier = all_results.setdefault(setting_name, {})
        for name, clf in classifiers.items():
            metrics = evaluate_pipeline(
                create_pipeline(clf, tfidf_vectorizer), X, y, kf, scoring
            )
            per_classifier[name] = metrics

            print(f"{name}:")
            print(f"Accuracy: {metrics['Accuracy']*100:.2f} %")
            print(f"F1-score: {metrics['F1-score']*100:.2f} %")
            print(f"Fit Time: {metrics['Fit Time (s)']:.2f} seconds")
            print()

    return all_results

### 8. Plotting Results

In [None]:
def plot_results(all_results):
    """Plot accuracy and F1-score of every classifier across the TF-IDF settings.

    One figure is drawn per metric, with one line per classifier and the
    TF-IDF settings on the x-axis.

    Parameters
    ----------
    all_results : dict
        Nested mapping ``{tfidf_setting: {classifier_name: metrics}}`` where
        `metrics` holds the keys 'Accuracy' and 'F1-score'. An empty mapping
        produces no figure.
    """
    tfidf_names = list(all_results)
    if not tfidf_names:
        return  # nothing to plot

    # Classifier names are taken from the results themselves (first-seen order)
    # rather than from the global `classifiers`, so the plot stays correct even
    # if that global has been redefined with different keys since the run.
    classifier_names = list(dict.fromkeys(
        clf for setting in tfidf_names for clf in all_results[setting]
    ))

    # (metric key in the results, y-axis label, figure title)
    figures = (
        ('Accuracy', 'Accuracy', 'Classifier Accuracy Across TF-IDF Settings'),
        ('F1-score', 'F1-Score', 'Classifier F1-Score Across TF-IDF Settings'),
    )

    for metric_key, y_label, title in figures:
        plt.figure(figsize=(12, 6))
        for clf in classifier_names:
            # A classifier missing from a setting becomes NaN for that point
            # instead of aborting the whole figure.
            scores = [
                all_results[setting].get(clf, {}).get(metric_key, float('nan'))
                for setting in tfidf_names
            ]
            plt.plot(tfidf_names, scores, marker='o', label=clf)
        plt.title(title)
        plt.xlabel('TF-IDF Setting')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        plt.show()

### 9. Run Experiments and Save Results

In [None]:
# Cross-validate every classifier under every TF-IDF setting
all_results = evaluate_all_classifiers(
    X=X,
    y=y,
    tfidf=tfidf,
    classifiers=classifiers,
    kf=kf,
)

In [None]:
# Draw the accuracy and F1-score figures for the collected results
plot_results(all_results=all_results)

In [None]:
file_name = "sentiment_analysis_no_tuning.csv"
folder_path = r"/Users/dimzografos/Desktop/Assignments/MLPC"
full_path = os.path.join(folder_path, file_name)

os.makedirs(folder_path, exist_ok=True)

# Flatten {tfidf_setting: {model: metrics}} into one row per (setting, model).
# Writing `all_results` here, not the preprocessed tweets in `data`, is what
# makes the file contain the experiment results its name promises.
results_df = pd.DataFrame([
    {'TF-IDF Setting': setting_name, 'Model': model_name, **metrics}
    for setting_name, per_model in all_results.items()
    for model_name, metrics in per_model.items()
])

# Save as CSV
results_df.to_csv(full_path, index=False)

print(f"File saved at: {full_path}")

### 1. Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, f1_score

### 2. Load Preprocessed Dataset

In [6]:
# Read the preprocessed, label-encoded tweets back from disk
data = pd.read_csv(
    filepath_or_buffer=r"/Users/dimzografos/Desktop/Assignments/MLPC/preprocessed_airline_sentiment.csv"
)

# Preview the first 5 rows
data.head(n=5)

Unnamed: 0,airline_sentiment,text
0,1,said
1,2,plus youve added commercial experience tacky
2,1,didnt today must mean need take another trip
3,0,really aggressive blast obnoxious entertainmen...
4,0,really big bad thing


In [8]:
# Discard, in place, every row whose text or label is missing
data.dropna(how='any', subset=['text', 'airline_sentiment'], inplace=True)

In [10]:
# Ensure all text values are strings
# The column is used below as the document input of the TF-IDF pipelines, so
# any non-string value left after the CSV round trip is cast explicitly here.
data['text'] = data['text'].astype(str)

In [12]:
# Features are the cleaned tweets, the target is the encoded sentiment
X, y = data['text'], data['airline_sentiment']

### 3. Define Classifiers and Hyperparameter Grids

In [15]:
# Define classifiers
# Every estimator gets a fixed random_state: the random forest and the neural
# network are stochastic, so without a seed the selected hyperparameters and
# reported scores would change from run to run.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': LinearSVC(max_iter=1000, dual='auto', random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Neural Network': MLPClassifier(max_iter=300, random_state=42)
}

In [17]:
# Hyperparameter grid per classifier, keyed by the same labels as `classifiers`.
# Parameter names carry the 'classifier__' prefix so they address the pipeline
# step named 'classifier'.
param_grids = {
    'Logistic Regression': dict(
        classifier__C=[0.1, 1, 10],
        classifier__solver=['lbfgs'],
    ),
    # max_iter is not tuned for the SVM; it is fixed when the estimator is created
    'SVM': dict(
        classifier__C=[0.1, 1, 10],
    ),
    'Random Forest': dict(
        classifier__n_estimators=[100, 300],
        classifier__max_depth=[None, 10, 30],
    ),
    'Neural Network': dict(
        classifier__hidden_layer_sizes=[(100,), (50, 50)],
        classifier__activation=['relu', 'tanh'],
    ),
}

### 4. Define TF-IDF Vectorization Settings

In [20]:
# One vectorizer per vocabulary restriction: document-frequency floor of 5,
# top 2500 terms, top 500 terms
tfidf = {
    label: TfidfVectorizer(**vocabulary_limit)
    for label, vocabulary_limit in [
        ('min_df=5', {'min_df': 5}),
        ('max_features=2500', {'max_features': 2500}),
        ('max_features=500', {'max_features': 500}),
    ]
}

### 5. Hyperparameter Tuning with GridSearchCV

In [23]:
# Shared shuffled 5-fold splitter (fixed seed) and the weighted-F1 scorer
# used to rank grid-search candidates
kf = KFold(n_splits=5, random_state=42, shuffle=True)
scorer = make_scorer(f1_score, average='weighted')

In [25]:
# Initialize a list to store tuning results
all_results = []

# Both metrics are scored inside the grid search itself. The winning
# candidate's accuracy and F1 are then read straight from cv_results_ instead
# of re-fitting the best pipeline ten more times on the very same folds.
scoring = {'accuracy': 'accuracy', 'f1_weighted': scorer}

# Loop through each TF-IDF configuration
for tfidf_name, tfidf_vectorizer in tfidf.items():
    print(f"\n============================")
    print(f" TF-IDF Setting: {tfidf_name}")
    print(f"============================")

    # Loop through each classifier and its parameter grid
    for clf_name, classifier in classifiers.items():
        print(f"Tuning {clf_name}...")

        # Create pipeline: TF-IDF vectorizer + classifier.
        # The vectorizer is a pipeline step, so it is fitted inside the search
        # together with the classifier.
        pipeline = Pipeline([
            ('tfidf', tfidf_vectorizer),
            ('classifier', classifier)
        ])

        # Run Grid Search with 5-fold cross-validation; candidates are ranked
        # (and the final model refitted) by weighted F1.
        grid = GridSearchCV(
            pipeline,
            param_grids[clf_name],
            cv=kf,
            scoring=scoring,
            refit='f1_weighted',
            n_jobs=-1,
        )
        grid.fit(X, y)

        # Best pipeline, refitted on all of X, y
        best_model = grid.best_estimator_

        # Mean cross-validated metrics of the selected candidate.
        # NOTE: these come from the same folds that selected the
        # hyperparameters, so they are optimistic estimates; an unbiased
        # figure would need nested cross-validation.
        best = grid.best_index_
        acc = grid.cv_results_['mean_test_accuracy'][best]
        f1 = grid.cv_results_['mean_test_f1_weighted'][best]
        # Measured while folds run in parallel (n_jobs=-1), so treat it as a
        # relative figure rather than an isolated timing.
        fit_time = grid.cv_results_['mean_fit_time'][best]

        # Store results in a list of dictionaries
        all_results.append({
            'TF-IDF Setting': tfidf_name,
            'Model': clf_name,
            'Accuracy': round(acc, 4),
            'F1-score': round(f1, 4),
            'Fit Time (s)': round(fit_time, 4),
            'Best Parameters': grid.best_params_
        })



 TF-IDF Setting: min_df=5
Tuning Logistic Regression...
Tuning SVM...
Tuning Random Forest...
Tuning Neural Network...

 TF-IDF Setting: max_features=2500
Tuning Logistic Regression...
Tuning SVM...
Tuning Random Forest...
Tuning Neural Network...

 TF-IDF Setting: max_features=500
Tuning Logistic Regression...
Tuning SVM...
Tuning Random Forest...
Tuning Neural Network...




### 6. Export Tuned Results 

In [32]:
# Destination of the tuning summary
output_path = r"/Users/dimzografos/Desktop/Assignments/MLPC/sentiment_analysis_with_tuning.csv"

# Tabulate the collected result records
results_df = pd.DataFrame(data=all_results)

# Write the table to disk without the index column
results_df.to_csv(path_or_buf=output_path, index=False)
print(f"\nTuning results saved to: {output_path}")


Tuning results saved to: /Users/dimzografos/Desktop/Assignments/MLPC/sentiment_analysis_with_tuning.csv
