In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('../Artifacts/reddit comments.csv')

In [3]:
data.head()

Unnamed: 0,comment_id,self_text,label
0,k70vrzb,Lol what kind of busted translator device you ...,0.0
1,k70vk4r,Translation:\n\nAmen. Kill jews and continue g...,0.0
2,k70vhew,Friendly fact reminder: Israel has been steali...,0.0
3,k70ve4h,"Well, i would never support Hamas, but there a...",0.0
4,k70vaxh,There is absolutely evidence of Hamas taking C...,1.0


## Data preproccesing

In [4]:
data.shape

(10001, 3)

In [5]:
data.duplicated().sum()

np.int64(0)

In [6]:
data.isnull().sum()

comment_id    0
self_text     0
label         2
dtype: int64

### Text Preprocess 

In [7]:
import re
import string

#### Convert uppercase into lowercase

In [8]:
data["self_text"] = data["self_text"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [9]:
data["self_text"].head(5)

0    lol what kind of busted translator device you ...
1    translation: amen. kill jews and continue gaza...
2    friendly fact reminder: israel has been steali...
3    well, i would never support hamas, but there a...
4    there is absolutely evidence of hamas taking c...
Name: self_text, dtype: object

#### Remove links

In [10]:
data["self_text"] = data['self_text'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))

#### Remove punctuation 

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data["self_text"] = data["self_text"].apply(remove_punctuations)

In [13]:
data["self_text"].head(10)

0    lol what kind of busted translator device you ...
1    translation amen kill jews and continue gazan ...
2    friendly fact reminder israel has been stealin...
3    well i would never support hamas but there act...
4    there is absolutely evidence of hamas taking c...
5    do you have the avtual link to the new york ti...
6    i didnt deny anything you said i said israel d...
7    gtyes exactly hamas attacked not the 1 million...
8    firstly even in the best scenario you are conf...
9    oh i guess since hamas is putting the children...
Name: self_text, dtype: object

#### Remove numbers

In [14]:
data["self_text"] = data['self_text'].str.replace(r'\d+', '', regex=True)

#### Remove stop words

In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [16]:
import nltk

In [17]:
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [19]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
data["self_text"] = data["self_text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [21]:
data["self_text"].head()

0    lol kind busted translator device got speak ha...
1    translation amen kill jews continue gazan oppr...
2    friendly fact reminder israel stealing palesti...
3    well would never support hamas actions israel ...
4    absolutely evidence hamas taking captagon drug...
Name: self_text, dtype: object

## Stemming

In [22]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [23]:
data["self_text"] = data["self_text"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [24]:
data["self_text"].head()

0       lol kind bust translat devic got speak hasbara
1         translat amen kill jew continu gazan oppress
2    friendli fact remind israel steal palestin lan...
3    well would never support hama action israel co...
4    absolut evid hama take captagon drug similar p...
Name: self_text, dtype: object

## Build Vocabulary

In [25]:
from collections import Counter
vocab = Counter()

In [26]:
vocab

Counter()

In [27]:
for sentence in data['self_text']:
    vocab.update(sentence.split())

In [28]:
len(vocab)

13638

In [29]:
data.shape

(10001, 3)

In [30]:
vocab

Counter({'israel': 3947,
         'hama': 2877,
         'peopl': 2519,
         'palestinian': 2127,
         'like': 1568,
         'isra': 1440,
         'jew': 1433,
         'gaza': 1406,
         'would': 1269,
         'war': 1163,
         'think': 1137,
         'one': 1133,
         'kill': 1099,
         'dont': 1054,
         'even': 1024,
         'civilian': 1014,
         'want': 996,
         'say': 994,
         'right': 968,
         'go': 964,
         'arab': 922,
         'get': 920,
         'know': 907,
         'countri': 848,
         'support': 841,
         'also': 823,
         'make': 801,
         'live': 765,
         'state': 759,
         'us': 759,
         'use': 730,
         'world': 724,
         'palestin': 712,
         'bomb': 696,
         'mani': 648,
         'see': 645,
         'year': 643,
         'time': 626,
         'attack': 623,
         'it’': 621,
         'land': 616,
         'take': 611,
         'peac': 609,
         'happen': 

In [31]:
tokens = [key for key in vocab if vocab[key] > 10]

In [32]:
len(tokens)

2337

In [33]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens, '../static/model/vocabulary.txt')

In [34]:
data

Unnamed: 0,comment_id,self_text,label
0,k70vrzb,lol kind bust translat devic got speak hasbara,0.0
1,k70vk4r,translat amen kill jew continu gazan oppress,0.0
2,k70vhew,friendli fact remind israel steal palestin lan...,0.0
3,k70ve4h,well would never support hama action israel co...,0.0
4,k70vaxh,absolut evid hama take captagon drug similar p...,1.0
...,...,...,...
9996,k6xxvbz,gtthe peopl ive fight equal right issu sinc pe...,1.0
9997,k6xxv7i,think someon right kick home live mani year,1.0
9998,k6xxunr,nobodi think egypt syria either,1.0
9999,k6xxtox,build arena sole purpos violent protest keep c...,1.0


## Divide dataset

In [35]:
x= data['self_text']
y= data['label']

In [36]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.0-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.1 MB 799.2 kB/s eta 0:00:14
   - -------------------------------------- 0.5/11.1 MB 799.2 kB/s eta 0:00:14
   -- ------------------------------------- 0.8/11.1 MB 819.2 kB/s eta 0:00:13
   --- ------------------------------------ 1.0/11.1 MB 751.1 kB/s eta 0:00:14
  

In [37]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [38]:
x_train.shape

(8000,)

In [39]:
x_test.shape

(2001,)

In [40]:
x_train

4472                                              condemn
4658    go say shit pissra western govern support geno...
6268                                                thank
4078    complet utterli ridicul europ start hunt musli...
3065                                            want land
                              ...                        
7710    yeah that’ bunch bs harbor let hide plan dumb ...
9719    would mean discuss reconstruct period war well...
6656    gtthe statu west bank militarili occupi territ...
2226                                                right
5388    ohhh nooo tore flag whatev isreal now…they cru...
Name: self_text, Length: 8000, dtype: object

In [41]:
y_test

4870    1.0
6252    1.0
7199    1.0
7240    1.0
9755    1.0
       ... 
6986    1.0
6990    1.0
5659    1.0
8962    1.0
1608    1.0
Name: label, Length: 2001, dtype: float64