# Importing the libs

In [55]:
import numpy as np
import pandas as pd
import re #substitution or replacement of substrings
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
# !pip install contractions
import contractions
# Fetch every NLTK resource the preprocessing helpers need, not only the
# stop-word list: word_tokenize relies on the Punkt tokenizer models
# ('punkt' / 'punkt_tab', depending on the NLTK release) and
# WordNetLemmatizer relies on the WordNet corpus. Without them a fresh
# environment fails with LookupError the first time text is preprocessed.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)
lemmatizer = WordNetLemmatizer()
tfidf_vectorizer = TfidfVectorizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alihi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Defined methods

In [56]:
def clean_text(text):
    """Strip URLs, non-ASCII characters, HTML tags and punctuation from ``text``.

    The steps run in this order:
      1. remove every run that starts with ``http`` and continues up to the
         next whitespace character;
      2. drop all characters that cannot be encoded as ASCII (emojis,
         accented letters, ...);
      3. remove HTML-style tags, one tag at a time;
      4. remove every character that is neither a word character nor
         whitespace.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Whitespace is not normalised, so removed spans
        may leave consecutive spaces behind.
    """
    # \S matches any non-whitespace character, so the match ends at the next
    # space, tab or newline.
    text = re.sub(r'http\S+', '', text)
    # errors='ignore' silently drops whatever cannot be represented in ASCII.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # '[^>]*' keeps each match inside a single tag. A greedy '.*' would run
    # from the first '<' to the last '>' on the line and delete the text
    # between the tags as well.
    text = re.sub(r'<[^>]*>', '', text)
    # Remove punctuation: anything that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    return text

def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize``; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. The surviving
    tokens keep their original casing and are re-joined with single spaces.

    Args:
        text: the string to filter.

    Returns:
        The filtered string (empty when no token survives).
    """
    # Load the stop-word list once and keep it on the function object.
    # Reloading it from the corpus for every document is wasted work when
    # this function is applied to a whole DataFrame column.
    english_stopwords = getattr(remove_stopwords, '_english_stopwords', None)
    if english_stopwords is None:
        english_stopwords = frozenset(stopwords.words('english'))
        remove_stopwords._english_stopwords = english_stopwords

    kept_words = [
        word for word in word_tokenize(text)
        if word.lower() not in english_stopwords
    ]
    return ' '.join(kept_words)

def spell_check_and_correction(text):
    """Run TextBlob's ``correct()`` over ``text`` and return the result as a ``str``."""
    return str(TextBlob(text).correct())

def to_lower(s):
    """Return the lower-cased form of ``s`` (the result of ``s.lower()``)."""
    lowered = s.lower()
    return lowered

def expanding_contractions(text):
    """Expand the contractions in ``text`` by delegating to ``contractions.fix``."""
    return contractions.fix(text)

def tokenization_and_lemmatization(text):
    """Tokenize ``text``, lemmatize each token and re-join the lemmas.

    Splitting is done with ``nltk.word_tokenize``; every token is passed to
    the module-level ``lemmatizer``; the results are joined with single
    spaces and returned as one string.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    return ' '.join(lemmas)

In [57]:
def preprocess_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    The stages are applied in order: cleaning, contraction expansion,
    stop-word removal, lower-casing, then tokenization plus lemmatization.
    """
    # spell_check_and_correction is currently disabled; when enabled it sits
    # right after clean_text.
    pipeline = (
        clean_text,
        expanding_contractions,
        remove_stopwords,
        to_lower,
        tokenization_and_lemmatization,
    )
    for stage in pipeline:
        text = stage(text)
    return text

# Testing the methods
print(preprocess_text("print they should pay all the back all the money plus interest the entire family and everyone who came in with them need to be deported asap why didn't it take two years to bust them here we go again another group stealing from the government and taxpayers a group of somalis stole over four million in government benefits over just  months weve reported on numerous cases like this one where the muslim refugeesimmigrants commit fraud by scamming our systemits way out of control more related"))

print pay back money plus interest entire family everyone came need deported soon possible take two year bust go another group stealing government taxpayer group somali stole four million government benefit month reported numerous case like one muslim refugeesimmigrants commit fraud scamming systemits way control related


# Data analysis

In [58]:
# Load the raw dataset from the working directory, decoding the file as latin1.
# NOTE(review): garbled apostrophes show up in the text column in later cell
# output, which looks like UTF-8 text decoded with the wrong codec -- confirm
# the file's real encoding.
df = pd.read_csv("dataset.csv",encoding = "latin1")
# Preview the first ten rows.
df.head(10)

Unnamed: 0,text,label
0,Print They should pay all the back all the mon...,0.0
1,Why Did Attorney General Loretta Lynch Plead T...,0.0
2,Red State : \nFox News Sunday reported this mo...,0.0
3,Email Kayla Mueller was a prisoner and torture...,0.0
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0.0
5,Print Hillary goes absolutely berserk! She exp...,0.0
6,BREAKING! NYPD Ready To Make Arrests In Weiner...,0.0
7,BREAKING! NYPD Ready To Make Arrests In Weiner...,0.0
8,\nLimbaugh said that the revelations in the Wi...,0.0
9,Email \nThese people are sick and evil. They w...,0.0


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156790 entries, 0 to 156789
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   text    153247 non-null  object 
 1   label   151791 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.4+ MB


In [60]:
df["label"].value_counts()

label
0.0    84308
1.0    67483
Name: count, dtype: int64

In [61]:
df.isna().sum()

text     3543
label    4999
dtype: int64

In [62]:
# Drop every row that has a missing value (NaN/None) in any column.
# Note: by default dropna() does not treat +/-inf as missing, so infinite
# values are NOT removed by this step.
df = df.dropna()
# Display the per-column count of missing values that remain.
df.isna().sum()

text     0
label    0
dtype: int64

In [63]:
# change the type of the label column to int
df['label'] = df['label'].astype(int)
df

Unnamed: 0,text,label
0,Print They should pay all the back all the mon...,0
1,Why Did Attorney General Loretta Lynch Plead T...,0
2,Red State : \nFox News Sunday reported this mo...,0
3,Email Kayla Mueller was a prisoner and torture...,0
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0
...,...,...
156784,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
156785,"LONDON (Reuters) - LexisNexis, a provider of l...",1
156786,MINSK (Reuters) - In the shadow of disused Sov...,1
156787,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [64]:
df.duplicated().sum()

12963

In [65]:
df = df.drop_duplicates()
df

Unnamed: 0,text,label
0,Print They should pay all the back all the mon...,0
1,Why Did Attorney General Loretta Lynch Plead T...,0
2,Red State : \nFox News Sunday reported this mo...,0
3,Email Kayla Mueller was a prisoner and torture...,0
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0
...,...,...
156784,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
156785,"LONDON (Reuters) - LexisNexis, a provider of l...",1
156786,MINSK (Reuters) - In the shadow of disused Sov...,1
156787,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [66]:
# Shuffle the rows with a fixed seed so that the row order -- and therefore
# the position-based train/test split further down (shuffle=False) -- is the
# same on every run.
df = df.sample(frac=1, random_state=42)
# Replace the shuffled index with a fresh 0..n-1 range.
df = df.reset_index(drop=True)
# Optional: uncomment to work on a 50,000-row subset.
# df = df[0:50000]
df

Unnamed: 0,text,label
0,us air strikes in afghanistan increase in sun...,1
1,senate republican leader obama politicize supr...,1
2,TUNIS (Reuters) - The Tunisian navy has arrest...,1
3,Donald Trump and his and the conservative medi...,0
4,u senator graham agrees putin north korea sanc...,1
...,...,...
138665,So words deemed offensive are only offensive i...,0
138666,President Donald Trump fumed against a federal...,0
138667,On MondayÃ¢ÂÂs broadcast of the Fox News Cha...,0
138668,There are a ton of tards who will continue to ...,0


In [67]:
# true = np.count_nonzero(df['label'] == 1)
# false = len(df['label']) - true
# plt.pie([true , false], labels= ["TRUE" , "FALSE"], autopct="%1.1f%%" )
# plt.show(block=False)

In [68]:
import plotly.express as px
# Count how many rows carry each value of the 'label' column.
label_counts = df['label'].value_counts()
# The label values themselves (the index of the counts), as a list.
label_values = label_counts.index.to_list()
# The number of rows per label value, in the same order as label_values.
value_counts = label_counts.to_list()
# Pie chart with one slice per label value, sized by its row count.
fig = px.pie(values=value_counts, names=label_values)
# Show the percentage on each slice, label + percentage on hover, and give
# the slices fixed colours with a black outline.
fig.update_traces(hoverinfo='label+percent',
                  textinfo='percent', 
                  textfont_size=20,
                  marker=dict(colors=['gold', 'mediumturquoise'], 
                              line=dict(color='#000000', width=2)))
# Dark theme: black background with a white title and white text.
fig.update_layout(
    title_text="Label column",
    title_font_color="white",
    paper_bgcolor="black",
    font_color="white") 
# Optional alternative: place the text inside the slices.
# fig.update_traces(textposition='inside', textinfo='percent+n')  # Adjust position and content
fig.show()

# Data preprocessing

In [69]:
df["text"] = df["text"].apply(preprocess_text)
df

Unnamed: 0,text,label
0,u air strike afghanistan increase sun oct pm u...,1
1,senate republican leader obama politicize supr...,1
2,tunis reuters tunisian navy arrested 550 tunis...,1
3,donald trump conservative medium pushing consp...,0
4,senator graham agrees putin north korea sancti...,1
...,...,...
138665,word deemed offensive offensive white racist p...,0
138666,president donald trump fumed federal judge haw...,0
138667,monday broadcast fox news channel tucker carls...,0
138668,ton tards continue laugh tongue consumed still...,0


In [70]:
# df.to_csv("C:\\Users\\alihi\\Desktop\\Programming\\df_processed.csv", index=False)

In [71]:
print(len(df["text"]))
print(len(df['label']))

138670
138670


In [72]:
# print(newdf.iloc[9,:])
# #newdf['label'] = pd.to_numeric(newdf['label'], errors='coerce')
# #newdf.iloc[9,:]
# newdf = newdf.dropna()
# print(newdf.iloc[9,:])

# Splitting into training and test sets

In [73]:
# Hold out 20% of the rows as the test set. shuffle=False splits by position
# (first 80% -> train, last 20% -> test); the rows were already shuffled when
# the frame was re-indexed earlier in the notebook.
x_train , x_test , y_train , y_test = train_test_split(df['text'],df['label'],test_size=0.2 , shuffle=False)

In [74]:
df.head(10)

Unnamed: 0,text,label
0,u air strike afghanistan increase sun oct pm u...,1
1,senate republican leader obama politicize supr...,1
2,tunis reuters tunisian navy arrested 550 tunis...,1
3,donald trump conservative medium pushing consp...,0
4,senator graham agrees putin north korea sancti...,1
5,store guardian front page 16yearold migrant cr...,1
6,iraq relocate hundred foreign wife child suspe...,1
7,email increasing symbiosis political leading m...,1
8,mike situation sorrentino thanked jersey shore...,0
9,never underestimate impact single person last ...,0


In [75]:
x_train.head(10)

0    u air strike afghanistan increase sun oct pm u...
1    senate republican leader obama politicize supr...
2    tunis reuters tunisian navy arrested 550 tunis...
3    donald trump conservative medium pushing consp...
4    senator graham agrees putin north korea sancti...
5    store guardian front page 16yearold migrant cr...
6    iraq relocate hundred foreign wife child suspe...
7    email increasing symbiosis political leading m...
8    mike situation sorrentino thanked jersey shore...
9    never underestimate impact single person last ...
Name: text, dtype: object

In [76]:
y_train.head(10)

0    1
1    1
2    1
3    0
4    1
5    1
6    1
7    1
8    0
9    0
Name: label, dtype: int32

# TF-IDF

In [77]:
# Learn the vocabulary and IDF weights from the training split only.
t1 = TfidfVectorizer()
x_train_new_text_matrix = t1.fit_transform(x_train)

# Project the test split into the SAME feature space with transform().
# Fitting a second vectorizer on the test data would produce a different
# vocabulary and column layout, so a model trained on the training matrix
# could not be applied to the test matrix, and it would leak test-set
# statistics into the features.
x_test_new_text_matrix = t1.transform(x_test)

# Keep the name `t2` bound for any later cell that refers to it; it now points
# at the single fitted vectorizer.
t2 = t1

In [78]:
x_train

0         u air strike afghanistan increase sun oct pm u...
1         senate republican leader obama politicize supr...
2         tunis reuters tunisian navy arrested 550 tunis...
3         donald trump conservative medium pushing consp...
4         senator graham agrees putin north korea sancti...
                                ...                        
110931    reuters new york time co increase spending cov...
110932    cruz gain online bet trump still republican fa...
110933    terrorist attack killed 49 wounded 53 orlando ...
110934    detroit terrible school unconstitutional geoff...
110935    washington reuters senior trump administration...
Name: text, Length: 110936, dtype: object

In [79]:
print(x_train_new_text_matrix)

  (0, 185235)	0.1856189252279844
  (0, 123359)	0.07823018389893932
  (0, 306088)	0.0697746922787704
  (0, 349186)	0.07864356233109247
  (0, 136966)	0.06900670175211888
  (0, 314554)	0.1297491656271809
  (0, 119731)	0.08948641050072435
  (0, 81600)	0.06047778286019309
  (0, 47353)	0.08034303487053765
  (0, 150326)	0.08178125599250623
  (0, 23856)	0.1657701380356941
  (0, 353806)	0.05094571628441886
  (0, 178561)	0.05829191515963804
  (0, 76064)	0.13893795165564468
  (0, 233433)	0.08486814283312229
  (0, 289403)	0.06800534099858067
  (0, 119814)	0.10400754770720065
  (0, 260334)	0.15126988080411785
  (0, 77141)	0.14190805931698466
  (0, 30225)	0.07565569708083911
  (0, 89862)	0.1328933387395471
  (0, 221679)	0.13837414370957093
  (0, 200032)	0.08638689036749964
  (0, 219096)	0.08091819703699259
  (0, 168878)	0.19103995042023514
  :	:
  (110935, 325899)	0.09569474951651218
  (110935, 99103)	0.015205576988920699
  (110935, 107897)	0.01992597341110392
  (110935, 314308)	0.04954695062693541


In [82]:
t1.vocabulary_

{'air': 25641,
 'strike': 303138,
 'afghanistan': 23863,
 'increase': 155832,
 'sun': 305298,
 'oct': 220909,
 'pm': 246474,
 'drone': 101208,
 'aircraft': 25684,
 'land': 177780,
 'jalalabad': 163932,
 'airport': 25790,
 'afp': 23962,
 'amin': 30764,
 'alemipress': 26997,
 'tv': 328020,
 'kabul': 168878,
 'number': 219096,
 'military': 200032,
 'official': 221679,
 'declared': 89862,
 'america': 30225,
 'conducted': 77141,
 'raid': 260334,
 'figure': 119814,
 'show': 289403,
 'percent': 233433,
 'compared': 76064,
 'last': 178561,
 'year': 353806,
 'afghan': 23856,
 'however': 150326,
 'believe': 47353,
 'country': 81600,
 'fight': 119731,
 'terror': 314554,
 'group': 136966,
 'without': 349186,
 'support': 306088,
 'force': 123359,
 'loading': 185235,
 'senate': 285120,
 'republican': 267958,
 'leader': 179846,
 'obama': 219873,
 'politicize': 247587,
 'supreme': 306337,
 'court': 82034,
 'process': 253723,
 'washington': 343258,
 'reuters': 269915,
 'mitch': 202221,
 'mcconnell': 19