<a href="https://colab.research.google.com/github/2303a51019/NLP/blob/main/NLP_LAB_O8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
print("Importing libraries...")
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from gensim.models import Word2Vec

nltk.download('stopwords')
print("Libraries imported successfully!\n")

Importing libraries...
Libraries imported successfully!



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Step 1: Load Dataset

In [11]:

data = pd.read_csv("/content/sample_data/tweets.csv")
print("Dataset loaded successfully!")
print("Dataset shape:", data.shape)
print("First 5 rows:\n", data.head(), "\n")

Dataset loaded successfully!
Dataset shape: (11370, 5)
First 5 rows:
    id keyword        location  \
0   0  ablaze             NaN   
1   1  ablaze             NaN   
2   2  ablaze   New York City   
3   3  ablaze  Morgantown, WV   
4   4  ablaze             NaN   

                                                text  target  
0  Communal violence in Bhainsa, Telangana. "Ston...       1  
1  Telangana: Section 144 has been imposed in Bha...       1  
2  Arsonist sets cars ablaze at dealership https:...       1  
3  Arsonist sets cars ablaze at dealership https:...       1  
4  "Lord Jesus, your love brings freedom and pard...       0   



Step 2: Preprocessing

In [12]:
stop_words = set(stopwords.words('english'))

def preprocess_tweet(tweet):
    # Lowercase
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user mentions and hashtags
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    words = tweet.split()
    # Remove stopwords
    words = [w for w in words if w not in stop_words]
    return words

data['clean_text'] = data['text'].apply(preprocess_tweet)
print("Sample preprocessed tweet:", data['clean_text'].iloc[0], "\n")

Sample preprocessed tweet: ['communal', 'violence', 'bhainsa', 'telangana', 'stones', 'pelted', 'muslims', 'houses', 'houses', 'vehicles', 'set', 'ablaze…'] 



Step 3: Train Word2Vec

In [13]:
print("Training Word2Vec model (skip-gram)...")
w2v_model = Word2Vec(sentences=data['clean_text'], vector_size=100, window=5, min_count=2, sg=1)
print("Word2Vec training completed!")
print("Vocabulary size:", len(w2v_model.wv.index_to_key), "\n")



Training Word2Vec model (skip-gram)...
Word2Vec training completed!
Vocabulary size: 10679 



Step 4: Convert Tweets → Vectors

In [14]:

def tweet_vector(words, model):
    word_vecs = []
    for word in words:
        if word in model.wv:
            word_vecs.append(model.wv[word])
    if len(word_vecs) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(word_vecs, axis=0)

X = np.array([tweet_vector(tweet, w2v_model) for tweet in data['clean_text']])
y = data['target'].values

print("Feature vector shape:", X.shape)
print("Example vector (first tweet):", X[0][:10], "\n")



Feature vector shape: (11370, 100)
Example vector (first tweet): [ 0.09420449  0.17645055  0.05208248  0.03935846  0.10249659 -0.27251661
 -0.11954924  0.41131532 -0.11774229 -0.0800787 ] 



Train-test split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape, "\n")

Training set size: (9096, 100)
Testing set size: (2274, 100) 



Train Logistic Regression

In [16]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression Training Completed!")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("\nClassification Report (Logistic Regression):\n", classification_report(y_test, y_pred_lr), "\n")


Logistic Regression Training Completed!
Accuracy: 0.8438874230430958
Precision: 0.7808219178082192
Recall: 0.14393939393939395
F1 Score: 0.24307036247334754

Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.85      0.99      0.91      1878
           1       0.78      0.14      0.24       396

    accuracy                           0.84      2274
   macro avg       0.81      0.57      0.58      2274
weighted avg       0.83      0.84      0.80      2274
 



Train Random Forest

In [17]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Training Completed!")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("\nClassification Report (Random Forest):\n", classification_report(y_test, y_pred_rf), "\n")


Random Forest Training Completed!
Accuracy: 0.8742304309586632
Precision: 0.7370689655172413
Recall: 0.4318181818181818
F1 Score: 0.5445859872611465

Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.89      0.97      0.93      1878
           1       0.74      0.43      0.54       396

    accuracy                           0.87      2274
   macro avg       0.81      0.70      0.74      2274
weighted avg       0.86      0.87      0.86      2274
 



Train SVM

In [18]:
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Training Completed!")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print("\nClassification Report (SVM):\n", classification_report(y_test, y_pred_svm), "\n")


SVM Training Completed!
Accuracy: 0.8372911169744943
Precision: 0.825
Recall: 0.08333333333333333
F1 Score: 0.15137614678899083

Classification Report (SVM):
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      1878
           1       0.82      0.08      0.15       396

    accuracy                           0.84      2274
   macro avg       0.83      0.54      0.53      2274
weighted avg       0.84      0.84      0.78      2274
 



Compare results

In [19]:
results = {
    "Logistic Regression": f1_score(y_test, y_pred_lr),
    "Random Forest": f1_score(y_test, y_pred_rf),
    "SVM": f1_score(y_test, y_pred_svm)
}
for model, score in results.items():
    print(f"{model}: F1 Score = {score:.4f}")

best_model = max(results, key=results.get)
print("\nBest Model with Word2Vec embeddings:", best_model)

if best_model == "SVM":
    print("Reason: SVM often performs better with dense word embeddings as it finds a good separating hyperplane in high-dimensional space.")
elif best_model == "Logistic Regression":
    print("Reason: Logistic Regression works well with continuous dense embeddings and avoids overfitting on smaller datasets.")
else:
    print("Reason: Random Forest can capture non-linearities but sometimes struggles with high-dimensional embeddings.")

Logistic Regression: F1 Score = 0.2431
Random Forest: F1 Score = 0.5446
SVM: F1 Score = 0.1514

Best Model with Word2Vec embeddings: Random Forest
Reason: Random Forest can capture non-linearities but sometimes struggles with high-dimensional embeddings.
