In [None]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import re
import nltk
import string
from nltk.text import Text

# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")
# Apply the 'fivethirtyeight' matplotlib style to all figures drawn below.
plt.style.use('fivethirtyeight')

In [None]:
##importing data

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Twitter Hate Speech.csv')
# Preview the first rows.
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
##data information 

In [None]:
# Rows are observations, columns are features.
n_rows, n_cols = df.shape
print(f'number of observation:{n_rows}')
print(f'number of features:{n_cols}')

In [None]:
# Column names of the loaded frame.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Summary of the frame printed by pandas.
df.info()


In [None]:
# Total number of cells (rows x columns).
df.size

In [None]:
# Every column except the first, kept as a DataFrame.
# NOTE: `text` is reassigned further down to the TF-IDF feature array.
text = df.iloc[:, 1:]
text.tail()

In [None]:
# First column only, kept as a one-column DataFrame.
# NOTE(review): assumes the first CSV column is the target -- confirm against df.columns.
# NOTE: `label` is reassigned several times further down (pie-chart labels, split target).
label = df.iloc[:, 0:1]
label.tail()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Count rows per class with boolean masks (label 1 = hate speech, label 0 = free speech).
is_hate = df['label'] == 1
is_free = df['label'] == 0
hate_speech = int(is_hate.sum())
free_speech = int(is_free.sum())
print('Hate Speech =', hate_speech)
print('Free Speech =', free_speech)

In [None]:
# Pie chart of the class balance computed in the previous cell.
speech = [hate_speech, free_speech]
label = ["Hate Speech", "Free Speech"]

fig, ax = plt.subplots()
ax.pie(
    speech,
    labels=label,
    shadow=False,
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
    startangle=90,
    colors=['red', 'blue'],
)

fig.tight_layout()
plt.show()

In [None]:
##cleaning the text

In [None]:
def lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# Overwrites the 'tweet' column in place with its lower-cased version.
df['tweet'] = df['tweet'].apply(lower_case)

In [None]:
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Map each punctuation code point to None so that translate() drops it.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Overwrites the 'tweet' column in place; '@', '#', ':' and '/' are all part of
# string.punctuation, so they are gone after this step.
df['tweet'] = df['tweet'].apply(remove_punct)

In [None]:
def remove_tag(text):
    """Remove @-mentions from ``text``.

    A mention is '@' followed by letters, digits or underscores. The underscore
    is included so that a handle such as ``@some_user`` is removed as a whole
    instead of leaving ``_user`` behind.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every mention deleted; surrounding whitespace is kept.
    """
    return re.sub(r'@[A-Za-z0-9_]+', "", text)

# NOTE(review): this runs after remove_punct has already deleted '@', so no
# mention can match here; consider stripping mentions before punctuation.
df['tweet'] = df['tweet'].apply(remove_tag)

In [None]:
def remove_special(text):
    """Keep only the whitespace-separated tokens that are purely alphanumeric.

    The surviving tokens are re-joined with single spaces.
    """
    alnum_tokens = filter(str.isalnum, text.split())
    return " ".join(alnum_tokens)

# Overwrites the 'tweet' column in place, dropping every token that is not purely alphanumeric.
df['tweet'] = df['tweet'].apply(remove_special)

In [None]:
def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with 'http'."""
    url_pattern = r"http\S+"
    return re.sub(url_pattern, "", text)

# NOTE(review): punctuation was removed in an earlier cell, so URLs reach this
# step without ':' '/' '.'; the pattern still drops any token starting with 'http'.
df['tweet'] = df['tweet'].apply(remove_urls)

In [None]:
# Preprocessing

In [None]:
#tokenizing

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
def tokenize(text):
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``."""
    # NOTE(review): word_tokenize presumably needs the NLTK 'punkt' data; no
    # download call for it is visible in this notebook -- confirm.
    return word_tokenize(text)


# From here on every entry of 'tweet' is a list of tokens rather than a string.
df['tweet'] = df['tweet'].apply(tokenize)

In [None]:
#removing stopwords

In [None]:
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK 'stopwords' corpus that remove_stop reads below.
nltk.download('stopwords')

In [None]:
# Filled on first use so the NLTK word list is loaded once, not once per token.
_STOPWORD_CACHE = {}


def remove_stop(text):
    """Drop English stop words from a list of tokens.

    Parameters
    ----------
    text : list of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    # Calling stopwords.words() and scanning the resulting list for every
    # single token is needlessly slow; keep one frozenset for O(1) lookups.
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    english_stop = _STOPWORD_CACHE['english']
    return [token for token in text if token not in english_stop]


df['tweet'] = df['tweet'].apply(remove_stop)

In [None]:
#lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK 'wordnet' data (presumably required by WordNetLemmatizer below).
nltk.download('wordnet')

In [None]:
def Lemmatize(text):
    """Lemmatize every token of ``text`` with a ``WordNetLemmatizer``.

    Returns a new list holding one lemmatized token per input token.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))


df['tweet'] = df['tweet'].apply(Lemmatize)

In [None]:
#wordcloud

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

In [None]:
# WordCloud (hate speech)
# Dedicated names are used so that the `hate_speech` count and NLTK's
# `stopwords` module defined in earlier cells are not overwritten.
hate_tweets = df.loc[df['label'] == 1, 'tweet']
wc_stopwords = set(STOPWORDS)

# Tweets are token lists at this stage: join the tokens themselves. str(list)
# would leak brackets, quotes and commas into the text fed to the word cloud.
comment_words = " ".join(
    " ".join(map(str, tweet)) if isinstance(tweet, (list, tuple)) else str(tweet)
    for tweet in hate_tweets
).lower()

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=wc_stopwords,
                      min_font_size=10).generate(comment_words)

# plot
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# WordCloud (free speech)
# Dedicated names are used so that the `free_speech` count and NLTK's
# `stopwords` module defined in earlier cells are not overwritten.
free_tweets = df.loc[df['label'] == 0, 'tweet']
wc_stopwords = set(STOPWORDS)

# Tweets are token lists at this stage: join the tokens themselves. str(list)
# would leak brackets, quotes and commas into the text fed to the word cloud.
comment_words = " ".join(
    " ".join(map(str, tweet)) if isinstance(tweet, (list, tuple)) else str(tweet)
    for tweet in free_tweets
).lower()

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=wc_stopwords,
                      min_font_size=10).generate(comment_words)

# plot
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# Feature extraction (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# TF-IDF vectorizer with its vocabulary capped via max_features=2000.
vectorizer = TfidfVectorizer(max_features= 2000)

In [None]:
# Turn each token list back into one space-separated string for the vectorizer.
# Entries that are already strings are kept unchanged, so re-running this cell
# does not split them into single characters.
list_to_str = [
    tokens if isinstance(tokens, str) else ' '.join(map(str, tokens))
    for tokens in df['tweet']
]

df['tweet'] = list_to_str

In [None]:
# Fit the vectorizer on all tweets and densify the result.
# NOTE(review): the vocabulary/IDF weights are fitted before the train/test
# split below, so test tweets influence the features -- confirm this is intended.
corpus = df['tweet']
tfidf_sparse = vectorizer.fit_transform(corpus)
text = tfidf_sparse.toarray()

In [None]:
# Shape of the dense TF-IDF feature array.
text.shape

In [None]:
# Train/test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Select the target by name, like the class-count cell does, instead of by
# position: `iloc[:, 0]` is only correct if 'label' is the first CSV column.
label = df['label']
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.2, random_state=0)

In [None]:
# Report the shapes of the training features and targets.
for split_name, split_data in (("X_train", X_train), ("y_train", y_train)):
    print(split_name, split_data.shape)

In [None]:
# Report the shapes of the held-out features and targets.
for split_name, split_data in (("X_test", X_test), ("y_test", y_test)):
    print(split_name, split_data.shape)

In [None]:
# Convolutional Neural Network with LSTM

In [None]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Bidirectional, TimeDistributed, Conv1D, ZeroPadding1D, GRU
from tensorflow.keras.layers import Lambda, Input, Dropout, Masking, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Model

In [None]:
#from tensorflow.python import keras
#from keras.models import Sequential
#from keras.layers import Dense, Flatten
#from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape, Merge
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Reshape

# CNN + LSTM Model
def cnn_lstm(input_dim, output_dim, dropout=0.2, n_layers=1):
    """Build an (uncompiled) Conv1D + stacked-LSTM classifier.

    Parameters
    ----------
    input_dim : int or tuple of int
        Per-sample input shape, without the batch axis. A scalar or a 1-tuple
        ``(n_features,)`` is treated as ``n_features`` time steps with a single
        channel; a 2-tuple ``(steps, channels)`` is passed to the convolution
        unchanged.
    output_dim : int
        Number of sigmoid output units (1 for binary classification).
    dropout : float, default 0.2
        Dropout rate used after the convolution, inside both LSTMs and after
        the dense layer.
    n_layers : int, default 1
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tensorflow.keras.models.Model
        The functional model. The caller is responsible for compiling it.

    Raises
    ------
    ValueError
        If ``input_dim`` has neither one nor two dimensions.
    """
    dtype = 'float32'

    # Normalise the shape to a tuple so both accepted layouts share one code path.
    if np.isscalar(input_dim):
        input_dim = (int(input_dim),)
    input_dim = tuple(input_dim)
    if len(input_dim) not in (1, 2):
        raise ValueError(
            f"input_dim must be (n_features,) or (steps, channels), got {input_dim!r}"
        )

    # ---- Network model ----
    input_data = Input(name='the_input', shape=input_dim, dtype=dtype)

    # Conv1D and LSTM expect (batch, steps, channels): add a channel axis to
    # flat feature vectors, leave already sequential input untouched.
    if len(input_dim) == 1:
        x = Reshape((input_dim[0], 1), name='add_channel')(input_data)
    else:
        x = input_data

    # One 1D convolution with stride 4 shortens the sequence the LSTMs must scan.
    x = Conv1D(filters=256, kernel_size=10, strides=4, name='conv_1')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(dropout, name='dropout_1')(x)

    x = LSTM(128, activation='relu', return_sequences=True,
             dropout=dropout, name='lstm_1')(x)
    x = LSTM(128, activation='relu', return_sequences=False,
             dropout=dropout, name='lstm_2')(x)

    x = Dense(units=64, activation='relu', name='fc')(x)
    x = Dropout(dropout, name='dropout_2')(x)

    # Output layer with sigmoid activation
    y_pred = Dense(units=output_dim, activation='sigmoid', name='sigmoid')(x)

    return Model(inputs=input_data, outputs=y_pred)


# Build, compile and train at module level so that `model` and the
# predictions exist for the evaluation cells below.
model = cnn_lstm(X_train.shape[1:], 1)

# A single sigmoid unit pairs with binary cross-entropy; the optimizer must be
# an instance (or a string name) and metrics a list.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(
    X_train, np.asarray(y_train),
    batch_size=128,
    epochs=25,
    validation_data=(X_test, np.asarray(y_test)),
)

# Keep the raw probabilities and derive hard 0/1 labels with a 0.5 threshold,
# because the sklearn classification metrics need class labels.
y_prob_train = model.predict(X_train).ravel()
y_prob_test = model.predict(X_test).ravel()
y_pred_train = (y_prob_train >= 0.5).astype(int)
y_pred_test = (y_prob_test >= 0.5).astype(int)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Create the Confusion Matrix
# The result is stored under its own name so the imported `confusion_matrix`
# function is not shadowed and this cell can be re-run.
# Predictions are thresholded at 0.5 first; this is a no-op for 0/1 labels and
# turns sigmoid probabilities into the class labels sklearn requires.
y_pred_labels = (np.asarray(y_pred_test).ravel() >= 0.5).astype(int)
# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
conf_mat = confusion_matrix(y_test, y_pred_labels, labels=[0, 1])

# Row = true class, column = predicted class.
TN, FP, FN, TP = conf_mat.ravel()


# Visualize the Matrix
group_names = ['TN','FP','FN','TP']

group_counts = ["{0:0.0f}".format(value) for value in conf_mat.flatten()]

group_percentages = ["{0:.2%}".format(value) for value in conf_mat.flatten()/np.sum(conf_mat)]

labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]

labels = np.asarray(labels).reshape(2,2)

ax = sns.heatmap(conf_mat, annot=labels, fmt='', cmap='Greens')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, roc_auc_score

In [None]:
# Label-based metrics need hard 0/1 predictions; thresholding at 0.5 is a no-op
# for labels and converts sigmoid probabilities when those are supplied.
y_scores = np.asarray(y_pred_test).ravel()
y_pred_labels = (y_scores >= 0.5).astype(int)

# Accuracy Score
Accuracy = accuracy_score(y_test, y_pred_labels)
print('Accuracy Score:', Accuracy)

# Precision Score
Precision = precision_score(y_test, y_pred_labels)
print('Precision Score:', Precision)

# True positive Rate (TPR) or Sensitivity or Recall
TPR = recall_score(y_test, y_pred_labels)
print('True positive Rate:', TPR)

# TN and FP come from the confusion-matrix cell; guard against a test set
# without any negative samples.
negatives = float(TN + FP)

# False positive Rate (FPR)
FPR = FP / negatives if negatives else float('nan')
print('False positive Rate', FPR)

# F1 Score or F-Measure or F-Score
F1 = f1_score(y_test, y_pred_labels)
print('F1 Score:', F1)

# Specificity
Specificity = TN / negatives if negatives else float('nan')
print('Specificity:', Specificity )

# Mean Absolute Error (for 0/1 labels this equals the misclassification rate)
Error = mean_absolute_error(y_test, y_pred_labels)
print('Mean Absolute Error:', Error)

# ROC Area -- computed on the scores as given; probabilities are more
# informative here than thresholded labels.
Roc = roc_auc_score(y_test, y_scores)
print('ROC Area:', Roc)

In [None]:
# Bar chart comparing the metrics computed in the previous cell.
fig, ax = plt.subplots(figsize=(12, 5))

result = [Accuracy, Precision, TPR, FPR, F1, Specificity, Error, Roc]
label = ["Accuracy", "Precision", "TPR", "FPR", "F-Score", "Specificity", "Error", "Roc Area"]
colors = ['red', 'green', 'blue', 'darkgoldenrod', 'orange', 'purple', 'brown', 'darkcyan']

ax.bar(label, result, color=colors, edgecolor='black')
plt.show()

In [None]:
import pickle 

In [None]:
# Persist the fitted TF-IDF vectorizer; the context manager closes the file
# even if pickling fails.
with open("cv.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Persist the trained model. The file handle gets its own name so the
# `cnn_lstm` builder function is not overwritten.
# NOTE(review): pickling Keras models depends on the Keras version; the native
# `model.save(...)` format may be the safer choice -- confirm.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the vectorizer written above; `with` guarantees the file is closed.
# SECURITY: pickle.load executes arbitrary code -- only load files this
# notebook produced itself.
with open("cv.pkl", "rb") as vectorizer_file:
    cv = pickle.load(vectorizer_file)
cv

In [None]:
# Reload the model written above; `with` guarantees the file is closed.
# SECURITY: pickle.load executes arbitrary code -- only load files this
# notebook produced itself.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# Example expected to be classified as hate speech
# NOTE(review): the comment is vectorized without the cleaning steps applied
# to the training tweets -- confirm whether it should be cleaned first.
comment = ["I support racism. I don't care"]
vect = cv.transform(comment).toarray()

# The model outputs a sigmoid score, so compare it against a 0.5 threshold
# rather than testing for equality with 1. Predict once and reuse the result.
hate_score = float(np.ravel(model.predict(vect))[0])

if hate_score >= 0.5:
    print("Hate Speech")
else:
    print("Free Speech")

In [None]:
# Example expected to be classified as free speech
# NOTE(review): the comment is vectorized without the cleaning steps applied
# to the training tweets -- confirm whether it should be cleaned first.
comment = [" I respect the all kind of nationalities"]
vect = cv.transform(comment).toarray()

# The model outputs a sigmoid score, so compare it against a 0.5 threshold
# rather than testing for equality with 1. Predict once and reuse the result.
hate_score = float(np.ravel(model.predict(vect))[0])

if hate_score >= 0.5:
    print("Hate Speech")
else:
    print("Free Speech")