# Final Project
#### Anna Tigranyan

# Testing the model in order to get categorys of the text

#### Importing libreries

In [1]:
import numpy as np
import pandas as pd
import pickle
import json

import seaborn as sns
import re

import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Toshiba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# importing train and test datas

with open('train.jsonl', 'rb') as f:
   tr_df = f.readlines()

with open('test.jsonl', 'rb') as f:
   ts_df = f.readlines()

In [3]:
def get_TextAndLabel(data):
    """
    Getting text and  true label of the meme:
    0:not-hateful 1:hateful
    
    Args:
        data: List of bytes
    
    Returns:
        data frame contains text and categorys of meme
    
    """
    df = pd.DataFrame()
    
    categorys = []
    string = []
    
    for i in data:
        i = i.decode('ASCII')# decoding bytes to string
        i = json.loads(i)# converting string to dictionory in order to get label and text from the string
        label = i['label'] # getting the label
        text = i['text'] # getting the text
        
        categorys.append(label) # adding the label to our categoris list
        string.append(text) # adding the text to our texts list
        
    # as we have lists of the texts and categoris let's join them to our data frame df in order to have dataframe of the 
    # texts and labels
    
    df['text'] = string
    df['category'] = categorys
    
    return df 

In [4]:
def get_Text(data):
    """
    Getting text of the meme
    
    Args:
        data: List of bytes
    
    Returns:
        data frame contains texts of memes
    
    """
    df = pd.DataFrame()
    
    string = []
    
    for i in data:
        i = i.decode('ASCII')# decoding bytes to string
        i = json.loads(i)# converting string to dictionory in order to get text from the string
        text = i['text'] # getting the text
        
        string.append(text) # adding the text to our texts list
        
    # as we have lists of the texts let's join them to our data frame df in order to have dataframe of the texts
    
    df['text'] = string
    
    return df 

Now let's get data frame df containing text and category of the meme using ambove written functions:

In [5]:
df_tr = get_TextAndLabel(tr_df)
df_ts = get_Text(ts_df)

In [6]:
# converting dtype of category calumn to categorical
df_tr['category'] = df_tr['category'].astype('category')  

We need to join our test and train datasets in order to do changes 

In [7]:
df = df_tr.append(df_ts)

In [8]:
df.head(3)

Unnamed: 0,text,category
0,its their character not their color that matters,0
1,don't be afraid to love again everyone is not ...,0
2,putting bows on your pet,0


In [9]:
df.tail(3)

Unnamed: 0,text,category
637,rare image of a worker picking cotton,
638,caught me cat and monkey today! this,
639,to the black guy reading this meme i am your f...,


df rows with NaN values are our test dataset

#### Removing punctuations and special characters

In [10]:
df["text"] = df['text'].str.replace('[^\w\s]','')
df.head()

Unnamed: 0,text,category
0,its their character not their color that matters,0
1,dont be afraid to love again everyone is not l...,0
2,putting bows on your pet,0
3,i love everything and everybody except for squ...,0
4,everybody loves chocolate chip cookies even hi...,0


#### Lowercasing

In [11]:
df['text']=df['text'].str.lower()
df.head()

Unnamed: 0,text,category
0,its their character not their color that matters,0
1,dont be afraid to love again everyone is not l...,0
2,putting bows on your pet,0
3,i love everything and everybody except for squ...,0
4,everybody loves chocolate chip cookies even hi...,0


#### Removing Stopwords

In [12]:
stop_words = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

df.head()

Unnamed: 0,text,category
0,character color matters,0
1,dont afraid love everyone like ex,0
2,putting bows pet,0
3,love everything everybody except squirrels hat...,0
4,everybody loves chocolate chip cookies even hi...,0


#### Stemming

In [13]:
ps=PorterStemmer()

In [14]:
def stem(text):
    L=[]
    for i in text.split():
        L.append(ps.stem(i))
        
        
    string=" ".join(L)    
    return string

In [15]:
#applying stemming
df['text'] = df['text'].apply(stem)

#### Vectorizing

In [16]:
#using CountVectorizer() approach to vectorize
coun_vect = CountVectorizer()

In [17]:
vector = coun_vect.fit_transform(df['text'])

In [18]:
vector=vector.toarray()

In [19]:
vector.shape

(9140, 6968)

#### Train Test Splitting

In [20]:
X_train=vector[0:8500] # first 8500 rows of our df
Y_train=df['category'][0:8500] 
X_test=vector[8500:]


## Logistic regression model

In [22]:
model_lr = LogisticRegression(solver='lbfgs')
model_lr.fit(X_train,Y_train)

# prediction
predictions_lr = model_lr.predict(X_test)

In [23]:
predictions_lr[:20]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
      dtype=int64)

______________________

#### Saving our model as a pickle file

In [24]:
pickle.dump(model_lr, open('model.pkl','wb'))