[markdown]
deep_learning_model_improved.py
Improved, runnable notebook-style Python script for the DL portion of the assignment.
Save this as a .py or paste into Jupyter as cells (cells separated by '# %%').

# %% [markdown]
"""
Overview
- Loads provided training and validation CSVs
- Performs EDA with 4+ visualizations
- Preprocesses text (cleaning, tokenization, stopword removal)
- Creates two deep-learning models:
    1) Embedding + BiLSTM
    2) TF-IDF -> Dense network
- Runs experiments varying hyperparameters
- Produces evaluation metrics and plots
- Saves experiment CSV and model artifacts
"""

In [1]:
# %%
# Imports
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

TF_AVAILABLE = True
try:
    import tensorflow as tf
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D
    from tensorflow.keras.optimizers import Adam, RMSprop, SGD
    from tensorflow.keras.callbacks import EarlyStopping
except Exception as e:
    TF_AVAILABLE = False
    print("TensorFlow/Keras not available in this environment.", e)

In [2]:
# %%
# Paths
TRAIN_PATH = '../Datasets/twitter_training.csv'
VALID_PATH = '../Datasets/twitter_validation.csv'

In [3]:
# %%
# Load data
def load_datasets(train_path=TRAIN_PATH, valid_path=VALID_PATH):
    train = pd.read_csv(train_path)
    valid = pd.read_csv(valid_path)
    return train, valid

train_df, valid_df = load_datasets()
print(f"Train shape: {train_df.shape}, Valid shape: {valid_df.shape}")
print(train_df.head())

Train shape: (74681, 4), Valid shape: (999, 4)
   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...     


In [7]:
# %% 
# EDA: Class distribution

# Try to detect label column automatically
possible_label_cols = [c for c in train_df.columns if c.lower() in ('label','sentiment','target')]
if possible_label_cols:
    LABEL_COL = possible_label_cols[0]
    print(f"Using label column: {LABEL_COL}")
else:
    raise ValueError("No label column found! Please check dataset columns.")

# Detect text column
possible_text_cols = [c for c in train_df.columns if c.lower() in ('text','review','content','tweet')]
if possible_text_cols:
    TEXT_COL = possible_text_cols[0]
    print(f"Using text column: {TEXT_COL}")
else:
    raise ValueError("No text column found! Please check dataset columns.")

# Plot distribution
class_counts = train_df[LABEL_COL].value_counts().sort_index()
plt.figure(figsize=(6,4))
class_counts.plot(kind='bar')
plt.title('Class distribution (train)')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()


ValueError: No label column found! Please check dataset columns.

In [None]:
# %%
# EDA: Review length distributions
train_df['char_len'] = train_df[TEXT_COL].astype(str).apply(len)
train_df['word_len'] = train_df[TEXT_COL].astype(str).apply(lambda x: len(x.split()))
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.hist(train_df['char_len'], bins=50)
plt.title('Character length distribution')
plt.subplot(1,2,2)
plt.hist(train_df['word_len'], bins=50)
plt.title('Word count distribution')
plt.show()

In [None]:
# %%
# EDA: Top unigrams
cv = CountVectorizer(stop_words='english', max_features=50)
cv.fit(train_df[TEXT_COL].astype(str))
uni = cv.transform(train_df[TEXT_COL].astype(str))
uni_sum = np.array(uni.sum(axis=0)).flatten()
words = cv.get_feature_names_out()
top_idx = np.argsort(uni_sum)[-20:][::-1]
plt.barh(np.arange(len(top_idx)), uni_sum[top_idx][::-1])
plt.yticks(np.arange(len(top_idx)), words[top_idx][::-1])
plt.title('Top 20 unigrams')
plt.show()

In [None]:
# %%
# EDA: Top bigrams
cv_bigram = CountVectorizer(ngram_range=(2,2), stop_words='english', max_features=50)
cv_bigram.fit(train_df[TEXT_COL].astype(str))
bi = cv_bigram.transform(train_df[TEXT_COL].astype(str))
bi_sum = np.array(bi.sum(axis=0)).flatten()
bi_words = cv_bigram.get_feature_names_out()
topb_idx = np.argsort(bi_sum)[-20:][::-1]
plt.barh(np.arange(len(topb_idx)), bi_sum[topb_idx][::-1])
plt.yticks(np.arange(len(topb_idx)), bi_words[topb_idx][::-1])
plt.title('Top 20 bigrams')
plt.show()

In [None]:
# %%
# Preprocessing
import html
STOPWORDS = set(ENGLISH_STOP_WORDS)

def clean_text(s: str) -> str:
    s = str(s).lower()
    s = html.unescape(s)
    s = re.sub(r'http\S+|www\S+', ' ', s)
    s = re.sub(r'@\w+', ' ', s)
    s = re.sub(r'#', ' ', s)
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

train_df['clean_text'] = train_df[TEXT_COL].astype(str).apply(clean_text)
valid_df['clean_text'] = valid_df[TEXT_COL].astype(str).apply(clean_text)
print(train_df[['text','clean_text',LABEL_COL]].head())

In [None]:
# %%
# Prepare inputs: sequences
MAX_NUM_WORDS = 20000
MAX_SEQ_LEN = 100
EMBED_DIM = 100

if TF_AVAILABLE:
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<OOV>')
    tokenizer.fit_on_texts(train_df['clean_text'])
    X_train_seq = tokenizer.texts_to_sequences(train_df['clean_text'])
    X_valid_seq = tokenizer.texts_to_sequences(valid_df['clean_text'])
    X_train_seq = pad_sequences(X_train_seq, maxlen=MAX_SEQ_LEN)
    X_valid_seq = pad_sequences(X_valid_seq, maxlen=MAX_SEQ_LEN)
    word_index = tokenizer.word_index
    print('Vocab size:', len(word_index))

In [None]:
# %%
# TF-IDF representation
TFIDF_MAX_FEATURES = 20000
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, stop_words='english')
X_train_tfidf = tfidf.fit_transform(train_df['clean_text']).toarray()
X_valid_tfidf = tfidf.transform(valid_df['clean_text']).toarray()

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y_train = le.fit_transform(train_df[LABEL_COL])
Y_valid = le.transform(valid_df[LABEL_COL])
print('Classes:', le.classes_)

In [None]:
# %%
# Model definitions
if TF_AVAILABLE:
    def build_lstm_model(vocab_size):
        model = Sequential([
            Embedding(vocab_size, EMBED_DIM, input_length=MAX_SEQ_LEN),
            Bidirectional(LSTM(64)),
            Dropout(0.5),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='sigmoid')
        ])
        return model

    def build_cnn_model(vocab_size):
        model = Sequential([
            Embedding(vocab_size, EMBED_DIM, input_length=MAX_SEQ_LEN),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            GlobalMaxPooling1D(),
            Dense(64, activation='relu'),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
        ])
        return model

    def build_dense_tfidf_model(input_dim):
        model = Sequential([
            Dense(512, activation='relu', input_shape=(input_dim,)),
            Dropout(0.5),
            Dense(128, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='sigmoid')
        ])
        return model

In [None]:
# %%
# Experiments grid
experiments = []
experiment_grid = [
    {'model_type':'lstm','lr':1e-3,'batch_size':64,'optimizer':'adam','epochs':5},
    {'model_type':'cnn','lr':1e-3,'batch_size':64,'optimizer':'rmsprop','epochs':5},
    {'model_type':'tfidf_dense','lr':1e-3,'batch_size':64,'optimizer':'adam','epochs':5}
]

# Training loop placeholder (requires TF)

In [None]:
# %%
# Evaluation helpers
def plot_history(history):
    plt.plot(history.history['loss'], label='train_loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.legend()
    plt.show()

from sklearn.metrics import ConfusionMatrixDisplay
def plot_confusion(cm, labels=None):
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels).plot(cmap=plt.cm.Blues)
    plt.show()

# %% [markdown]
# Report and README instructions
# - Generate PDF report with dataset, EDA visuals, preprocessing, models, experiments, evaluation, conclusions.
# - Use `experiments_results.csv` for tables.
# - GitHub repo should include code, report, README.