In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

nlp = spacy.blank('ur')

import numpy as np

In [2]:
df = pd.read_csv("final_combined_mobandus.csv")

In [3]:
df.head()

Unnamed: 0,Text,Label
0,کہ کے لے لی شام دلے کی,o
1,اب اگر اس نے کچھ جواب دیا تو اس کی گانڈ مار دی...,o
2,اب ان چوتیوں نے وہ جو کنسرٹ ہو رہا تھا وہ بھی ...,h
3,اب ان کی گانڈ میں ہاتھ ڈال کر انتڑیاں نکالے گا...,o
4,اب تو اتنی مار دی تیری بھائی نے کہ اب تو شادی ...,o


In [5]:
df.shape

(9005, 2)

In [11]:
df['Label'].value_counts()


Label
o    4566
h    3304
n     991
Name: count, dtype: int64

In [12]:
#remove and NaN from df
df = df.dropna()


In [13]:
corpus = ' '.join(df['Text'])

nlp.max_length  = 10000000

tokens = nlp(corpus)
# Calculate the vocabulary size
vocab = set(tokens)
vocab_size = len(vocab)

# Print the vocabulary size
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 290128


In [14]:
vectorizer = TfidfVectorizer()

# Fit the vectorizer to your text data
vectorizer.fit(df['Text'])

# Transform the text data into TF-IDF vectors
vectors = vectorizer.transform(df['Text']).toarray()
vectors.shape

(8861, 19156)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(vectors, df['Label'], test_size=0.2, random_state=42)


In [40]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the Random Forest classifier to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

In [41]:
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.8522278623801467
Precision: 0.8626644714013947
Recall: 0.8522278623801467
F1-Score: 0.8474463568192321


#### Now we try the smote oversampling technique

In [42]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split




# Split the data into features (X) and labels (y)
X = df['Text']
y = df['Label']

# Convert text data to numerical format using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the class distribution
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Now X_resampled and y_resampled contain the oversampled data





In [43]:
print(X_resampled.shape)
X_train.shape

(11001, 19156)


(7088, 19156)

In [44]:

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the Random Forest classifier to the training data
rf_model.fit(X_resampled, y_resampled)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)


In [45]:
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.8527918781725888
Precision: 0.8612380091110491
Recall: 0.8527918781725888
F1-Score: 0.8491834062465644


## Now we try using a word2vec vectorization scheme

In [46]:
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize

In [48]:
df['tokenized_text'] = df['Text'].apply(word_tokenize)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_text'], df['Label'], test_size=0.2, random_state=42)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(X_train, total_examples=len(X_train), epochs=10)


(1776613, 2191980)

In [49]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in words:
        if word in vocabulary:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

In [50]:
train_vectors = [average_word_vectors(tokens, word2vec_model, word2vec_model.wv.index_to_key, 100) for tokens in X_train]
test_vectors = [average_word_vectors(tokens, word2vec_model, word2vec_model.wv.index_to_key, 100) for tokens in X_test]


In [54]:
rf_classifier_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_word2vec.fit(train_vectors, y_train)
predictions_word2vec = rf_classifier_word2vec.predict(test_vectors)

Accuracy: 0.835307388606881
Precision: 0.8437691555584621
Recall: 0.835307388606881
F1-Score: 0.8298529932430225


### using both tf-idf and word2vec

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
import numpy as np
from scipy.sparse import hstack

# Sample dataset (replace with your own dataset)


# Tokenize the text using NLTK
df['tokenized_text'] = df['Text'].apply(word_tokenize)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Label'], test_size=0.2, random_state=42)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=X_train.apply(word_tokenize), vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(X_train.apply(word_tokenize), total_examples=len(X_train), epochs=10)

# Function to average word vectors for a document
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in words:
        if word in vocabulary:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector


# Create Word2Vec feature vectors
train_w2v_vectors = np.array([average_word_vectors(word_tokenize(text), word2vec_model, word2vec_model.wv.index_to_key, 100) for text in X_train])
test_w2v_vectors = np.array([average_word_vectors(word_tokenize(text), word2vec_model, word2vec_model.wv.index_to_key, 100) for text in X_test])

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Combine Word2Vec and TF-IDF features
X_train_combined = hstack([train_w2v_vectors, X_train_tfidf])
X_test_combined = hstack([test_w2v_vectors, X_test_tfidf])



In [59]:
# Train Random Forest classifier on combined features
rf_classifier_combined = RandomForestClassifier(random_state=42)
rf_classifier_combined.fit(X_train_combined, y_train)
predictions_combined = rf_classifier_combined.predict(X_test_combined)

# Evaluate the performance on combined features
accuracy = metrics.accuracy_score(y_test, predictions_combined)
precision = metrics.precision_score(y_test, predictions_combined, average='weighted')
recall = metrics.recall_score(y_test, predictions_combined, average='weighted')
f1_score = metrics.f1_score(y_test,predictions_combined, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.8454596728708403
Precision: 0.8537591712481136
Recall: 0.8454596728708403
F1-Score: 0.8394404432643452


In [60]:
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train_tfidf, y_train)
predictions_lr = lr_classifier.predict(X_test_tfidf)

accuracy = metrics.accuracy_score(y_test, predictions_lr)
precision = metrics.precision_score(y_test, predictions_lr, average='weighted')
recall = metrics.recall_score(y_test,predictions_lr, average='weighted')
f1_score = metrics.f1_score(y_test,predictions_lr, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.8550479413423576
Precision: 0.8643033486557776
Recall: 0.8550479413423576
F1-Score: 0.8493149370038469


### now we try using da vinci
