In [19]:
import string
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import joblib
import pickle

In [20]:
cols = ['Message', 'Category']
data = pd.read_csv("roman_urdu.csv", usecols=cols)

In [21]:
data

Unnamed: 0,Message,Category
0,\n Un ki biwi aur waldah Iqbal ka naam Imam Bi...,ham
1,\nAllama Iqbal ke walid Sheikh Noor Muhammad k...,ham
2,\nAllama Iqbal ki waldah ka inteqal9 November ...,ham
3,Aakhirkar is wusat-e-beyan ka intezam ...,ham
4,Aala taleem ke liye Cambridge gae to a...,ham
...,...,...
20010,😏😏😏ok nikal giii,ham
20011,😒😒 ye joh prem katha likhi hai na upr isi ko i...,ham
20012,😛😜😜😛 I miss u my namonay😘😘😘,spam
20013,😆same situation kuch mari b,ham


Text Pre-Processing

In [22]:
def text_process(message):
    
# remove punctuations
    message = [word for word in message if word not in string.punctuation]
# join the message after removing   
    message = ''.join(message)
    
    return [word for word in message.split() if word.lower() not in stopwords.words('english')]
    

# apply the text processing to messages column
data['Message'].apply(text_process)

In [23]:
# create a vectorizer
vectorizer = CountVectorizer().fit(data['Message'])
vec_file = 'vectorizer_urdu'
pickle.dump(vectorizer, open(vec_file, 'wb'))

In [24]:
# transform the messages into bag of words using vectorizer
bag_of_words_messages = vectorizer.transform(data['Message'])

In [25]:
print('Shape of Sparse Matrix: ', bag_of_words_messages.shape)
print('Amount of Non-Zero occurences: ', bag_of_words_messages.nnz)

Shape of Sparse Matrix:  (20015, 32205)
Amount of Non-Zero occurences:  236097


In [26]:
# Create TF-IDF for weights and normalization of vectors
tf_transformer = TfidfTransformer().fit(bag_of_words_messages)
vec_file = 'transformer_urdu'
pickle.dump(tf_transformer, open(vec_file, 'wb'))

In [27]:
tf_idf_messages = tf_transformer.transform(bag_of_words_messages)

In [28]:
print(tf_idf_messages.shape)

(20015, 32205)


In [29]:
# create a training and testing set 
msg_train, msg_test, cat_train, cat_test = train_test_split(tf_idf_messages, data['Category'], test_size = 0.3)

Fitting into Naive Bayes model

In [30]:
nb_classify = MultinomialNB().fit(msg_train,cat_train)

In [31]:
predictionsNB = nb_classify.predict(msg_test)

In [32]:
print(classification_report(predictionsNB, cat_test))

              precision    recall  f1-score   support

         ham       1.00      0.74      0.85      6004
        spam       0.00      0.00      0.00         1

    accuracy                           0.74      6005
   macro avg       0.50      0.37      0.43      6005
weighted avg       1.00      0.74      0.85      6005



In [33]:
with open('naive_bayes_urdu','wb') as f:
    pickle.dump(nb_classify,f)

Fitting into Logistic Regression model

In [34]:
lr_classify = LogisticRegression(max_iter=10000).fit(msg_train,cat_train)

In [35]:
predictionsLR = lr_classify.predict(msg_test)

In [36]:
print(classification_report(predictionsLR, cat_test))

              precision    recall  f1-score   support

         ham       0.98      0.74      0.85      5902
        spam       0.02      0.31      0.04       103

    accuracy                           0.73      6005
   macro avg       0.50      0.53      0.44      6005
weighted avg       0.97      0.73      0.83      6005



In [37]:
with open('logistic_regression_urdu','wb') as f:
    pickle.dump(lr_classify,f)

Fitting into Decision Tree model

In [38]:
dt_classify = DecisionTreeClassifier().fit(msg_train,cat_train)

In [39]:
predictionsDT = dt_classify.predict(msg_test)

In [40]:
print(classification_report(predictionsDT, cat_test))

              precision    recall  f1-score   support

         ham       0.80      0.74      0.77      4779
        spam       0.22      0.28      0.24      1226

    accuracy                           0.65      6005
   macro avg       0.51      0.51      0.51      6005
weighted avg       0.68      0.65      0.66      6005



In [41]:
with open('decision_tree_urdu','wb') as f:
    pickle.dump(dt_classify,f)