In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords    # nltk -- natural language tool lit : ==> stopwords means those words that do not have much effect to our processing
from nltk.stem.porter import PorterStemmer   # gives the root word about our words
from sklearn.feature_extraction.text import TfidfVectorizer # use to convert text in features data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nexti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the stop words in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,1,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,1,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,1,factsguide: society now #motivation


In [5]:
df.shape

(31962, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [7]:
df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
df['label'].value_counts()

0    18887
1    13075
Name: label, dtype: int64

In [9]:
# separating the data set

tweet_column = df.drop(columns='label', axis=1)
label_column = df['label']

In [10]:
print(tweet_column)

          id                                              tweet
0          1   @user when a father is dysfunctional and is s...
1          2  @user @user thanks for #lyft credit i can't us...
2          3                                bihday your majesty
3          4  #model   i love u take with u all the time in ...
4          5             factsguide: society now    #motivation
...      ...                                                ...
31957  31958  ate @user isz that youuu?ðððððð...
31958  31959    to see nina turner on the airwaves trying to...
31959  31960  listening to sad songs on a monday morning otw...
31960  31961  @user #sikh #temple vandalised in in #calgary,...
31961  31962                   thank you @user for you follow  

[31962 rows x 2 columns]


In [11]:
print(label_column)

0        1
1        0
2        1
3        0
4        1
        ..
31957    1
31958    1
31959    0
31960    1
31961    0
Name: label, Length: 31962, dtype: int64


### Steming :

Is the process of reducing a word to its root word


example:
actor,  actress, acting ---> act

In [12]:
port_stem = PorterStemmer()

In [13]:
twit = tweet_column['tweet']

def stemming(twit):   # creating a function called steaming
    stemmed_content = re.sub('[^a-zA-Z]',' ', twit) # remoing characters
    stemmed_content = stemmed_content.lower() # converting all the words to lower case
    stemmed_content = stemmed_content.split() # splitting the words and converting it to list
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')] # performing steming; reducing each word to its root word and also removing the stop words
    stemmed_content = ' '.join(stemmed_content) #
    return stemmed_content

In [14]:
steem_tweet = tweet_column['tweet'].apply(stemming)

In [15]:
print(steem_tweet)

0        user father dysfunct selfish drag kid dysfunct...
1        user user thank lyft credit use caus offer whe...
2                                           bihday majesti
3                              model love u take u time ur
4                                  factsguid societi motiv
                               ...                        
31957                                   ate user isz youuu
31958    see nina turner airwav tri wrap mantl genuin h...
31959             listen sad song monday morn otw work sad
31960     user sikh templ vandalis calgari wso condemn act
31961                                    thank user follow
Name: tweet, Length: 31962, dtype: object


In [16]:
x = steem_tweet.values
y = label_column.values

In [17]:
print(x)

['user father dysfunct selfish drag kid dysfunct run'
 'user user thank lyft credit use caus offer wheelchair van pdx disapoint getthank'
 'bihday majesti' ... 'listen sad song monday morn otw work sad'
 'user sikh templ vandalis calgari wso condemn act' 'thank user follow']


In [18]:
print(y)

[1 0 1 ... 0 1 0]


In [19]:
###   Bag of words vectorization

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)

#test_x_vector = vectorizer.transform(test_x)
 
train_x_array = x.toarray()
print(train_x_array)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
print(x)

  (0, 29005)	1
  (0, 9012)	1
  (0, 7824)	2
  (0, 24159)	1
  (0, 7558)	1
  (0, 14711)	1
  (0, 23484)	1
  (1, 29005)	2
  (1, 27218)	1
  (1, 16405)	1
  (1, 6005)	1
  (1, 28998)	1
  (1, 4468)	1
  (1, 19699)	1
  (1, 30100)	1
  (1, 29091)	1
  (1, 20637)	1
  (1, 7120)	1
  (1, 10572)	1
  (2, 2784)	1
  (2, 16577)	1
  (3, 17712)	1
  (3, 16103)	1
  (3, 26774)	1
  (3, 27747)	1
  :	:
  (31958, 13194)	1
  (31958, 28469)	1
  (31958, 19094)	1
  (31958, 486)	1
  (31958, 16770)	1
  (31958, 24545)	1
  (31958, 4891)	1
  (31959, 23584)	2
  (31959, 17792)	1
  (31959, 17911)	1
  (31959, 30586)	1
  (31959, 25382)	1
  (31959, 15755)	1
  (31959, 20126)	1
  (31960, 29005)	1
  (31960, 5555)	1
  (31960, 27106)	1
  (31960, 24734)	1
  (31960, 29097)	1
  (31960, 4092)	1
  (31960, 30729)	1
  (31960, 173)	1
  (31961, 29005)	1
  (31961, 27218)	1
  (31961, 9663)	1


### Splitting the dataset to training and testing data

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 2, stratify=y)

In [25]:
print(x.shape, x_train.shape, x_test.shape)

(31962, 31307) (25569, 31307) (6393, 31307)


### Training the Model : Decision Treet and Logistics Regression

#### Making a prediction system

### Decision Tree Model

In [23]:
from sklearn import tree


clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train, y_train)


x_train_prediction = clf.predict(x_train)
training_score_accuracy = accuracy_score(x_train_prediction, y_train)
print('Accuracy score for Decision Tree of the trained data : ', training_score_accuracy)

#x_test_prediction = clf.predict(x_test)
#test_score_accuracy = accuracy_score(x_test_prediction, y_test)
#print('Accuracy score for Decision Tree of the test data : ', test_score_accuracy)

print('\n')

x_new = x_test[300]
prediction = clf.predict(x_new)
print('The Prediction is ', prediction)

if (prediction[0] == 0):
    print('This is not a racist or sexist tweet')
else:
    print('This is a racist or sexist tweet')

Accuracy score for Decision Tree of the trained data :  0.96601353201142


The Prediction is  [1]
This is a racist or sexist tweet


### Logistic Regression Model

In [24]:
model = LogisticRegression()
model.fit(x_train, y_train)

# accuracy score on training data

x_train_prediction = model.predict(x_train)
training_score_accuracy = accuracy_score(x_train_prediction, y_train)
print('Accuracy score of the trained data : ', training_score_accuracy)

# accuracy score on test data

#x_test_prediction = model.predict(x_test)
#test_score_accuracy = accuracy_score(x_test_prediction, y_test)
#print('Accuracy score of the test data : ', test_score_accuracy)

x_new = x_test[300]

print('\n')

prediction = model.predict(x_new)
print('The Prediction is ', prediction)

if (prediction[0] == 0):
    print('This is not a racist or sexist tweet')
else:
    print('This is a racist or sexist tweet')

Accuracy score of the trained data :  0.8211897219288983


The Prediction is  [1]
This is a racist or sexist tweet


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
