In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## 1. Load datasets

In [None]:
# Directory holding the competition CSVs (presumably a Google Drive folder
# mounted in Colab -- confirm). Defined once so the notebook can be pointed
# at another location by editing a single line.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

# Fail fast with a clear message if a column used later in the notebook is
# missing, instead of hitting a bare KeyError several cells further down.
for _csv_name, _frame, _required in (
    ("train.csv", train_df, ("tweet", "label")),
    ("test.csv", test_df, ("id", "tweet")),
):
    _missing = [col for col in _required if col not in _frame.columns]
    if _missing:
        raise KeyError(f"{_csv_name} is missing required column(s): {_missing}")

## 2. Preprocessing function

In [None]:
def clean_tweet(text):
    """Normalize a raw tweet for bag-of-words feature extraction.

    Steps, in order: drop URLs (anything starting with ``http`` up to the
    next whitespace), drop ``@mentions``, drop the ``#`` symbol while keeping
    the hashtag word, drop every character that is not an ASCII letter, a
    digit or whitespace, then lowercase and strip the ends.

    Parameters
    ----------
    text : str
        The raw tweet. Any non-string value (e.g. ``None`` or the NaN that
        pandas uses for an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tweet. Interior whitespace is left untouched, so removed
        tokens can leave consecutive spaces behind.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; map missing values to "".
        return ""

    text = re.sub(r"http\S+", "", text)        # Remove URLs
    text = re.sub(r"@\w+", "", text)           # Remove mentions
    text = re.sub(r"#", "", text)              # Remove hashtag symbol
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text) # Remove special characters
    return text.lower().strip()                # Lowercase and trim the ends

# Add a normalized "clean_tweet" column to the training frame, then the test frame
for frame in (train_df, test_df):
    frame["clean_tweet"] = frame["tweet"].apply(clean_tweet)


## 3. Train/validation split for evaluation

In [None]:
# Hold out 20% of the labelled tweets for validation. The split is stratified
# on the label so both parts keep the same class ratio (the classifier is
# configured for imbalanced classes, so an unstratified split could distort
# the validation metrics). The fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_df["clean_tweet"],
    train_df["label"],
    test_size=0.2,
    random_state=42,
    stratify=train_df["label"],
)


## 4. Build TF-IDF + SVM pipeline

In [None]:
# TF-IDF features feeding a linear SVM classifier.
svm_pipeline = Pipeline([
    # Unigrams + bigrams (bigrams help), vocabulary capped at 7000 terms
    ("tfidf", TfidfVectorizer(max_features=7000, ngram_range=(1, 2))),
    # class_weight="balanced" handles class imbalance; random_state pins the
    # solver's RNG (same seed as the train/validation split) so repeated runs
    # produce the same model
    ("svm", LinearSVC(class_weight="balanced", random_state=42)),
])

### Train on the training split and validate

In [None]:
# Fit on the training split, then predict the held-out validation split
val_preds = svm_pipeline.fit(X_train, y_train).predict(X_val)

# Per-class precision / recall / F1 on the validation split
val_report = classification_report(y_val, val_preds)
print("\nValidation Report:\n", val_report, sep="\n")

## 5. Train on the full data, predict on test.csv, and create submission_svm.csv

In [None]:
# Refit the same pipeline on every labelled tweet, then label the test set
test_preds = svm_pipeline.fit(
    train_df["clean_tweet"], train_df["label"]
).predict(test_df["clean_tweet"])

# Two-column submission: each test id next to its predicted label
submission_columns = {"id": test_df["id"], "label": test_preds}
submission_df = pd.DataFrame(submission_columns)

submission_df.to_csv("submission_svm.csv", index=False)
print("\n✅ submission_svm.csv created!")