In [3]:
import pandas as pd
import sys,json,pickle,re
from utils import *
from config import *

Original file downloaded from Github (https://github.com/t-davidson/hate-speech-and-offensive-language)

In [4]:
# train_file = "data/labeled_data.csv"
train_file = "/home/ashish/Documents/dd_classification/data/df_version_16_Mar.csv"

Load Train data in dataframe

In [5]:
train_df = pd.read_csv(train_file)
train_df.sample(10)

Unnamed: 0.1,Unnamed: 0,Text,Label
5462,6223,पहले नहीं था मतलब दवाइयों से पहले नहीं था।,2.0
3077,3680,इसके लिए हमें नौकरी चाहिए। नौकरी के आलावा बाकी...,2.0
2850,3423,फैमिली ( English : Family ) को लेके जो भी है |...,0.0
5726,6498,मेरे भाईया ना एक पी आई टी ( english : PIT ) हॉ...,2.0
3649,4325,दवा बता रही हूँ ना डॉक्टर ( english : doctor )...,2.0
1243,1689,जब ठीक रहती हूँ तब भी सुबह - शाम ही खाती हूँ ...,1.0
3837,4518,130 रुपए का बहुत महेंगा पीते है। न सस्ता पीते है।,2.0
5049,5801,"हाँ, अब जब सब सही चल रहा था तो उसका सब सही था,...",2.0
215,565,जो मैं करती हूँ वो भी वही करते है।,2.0
508,862,"चाह , क्या होता था डर लगता था ?",2.0


In [6]:
train_df['Label'].unique()

array([ 0.,  2.,  1., nan])

In [7]:
train_df = train_df[['Text','Label']]
train_df.isnull().any()
train_df.dropna(inplace = True)
train_df.isnull().any()

Text     False
Label    False
dtype: bool

In [8]:
train_df['Label'].unique()

array([0., 2., 1.])

In [9]:
train_df.duplicated().any()

False

In [10]:
train_df['Label'] = train_df['Label'].apply(str)

In [11]:
train_df['Label'].unique()
 

array(['0.0', '2.0', '1.0'], dtype=object)

In [12]:
mapping = {'0.0': 'intro', '1.0': 'dd', '2.0' : 'unknown'}
train_df = train_df.replace({'Label': mapping})

In [13]:
train_df = train_df[~(train_df['Label'] == "unknown")]


In [14]:
train_df.head(5)

Unnamed: 0,Text,Label
0,मैं अपने लिए तो अपने आप बनाती हूँ |,intro
1,और वो लोग अपने लिए अपने आप बनाते है उनकी फॅमिली |,intro
3,नहीं वो है बड़ी वाली तो जो उनकी बेटी है और जो ...,intro
5,हांजी मेरी (english : starting ) वैसे राजोरी ग...,intro
8,नहीं नहीं (english : Marriage ) से पहले में (e...,dd


In [15]:
train_df['Label'].unique()

array(['intro', 'dd'], dtype=object)

Find Unique labels and create label map

In [16]:
# 0 - hate speech 1 - offensive language 2 - neither as found in documentation
labels = list(train_df["Label"].unique())
print(labels)
label2ind = {'intro':0, 'dd':1}
ind2label = {i:l for l,i in label2ind.items()}
with open("data/label2ind.json","w") as f:
    json.dump(label2ind,f,indent=1)
with open("data/ind2label.json","w") as f:
    json.dump(ind2label,f,indent=1)

['intro', 'dd']


Create a new dataframe with only text and label columnn, also dump in CSV

In [17]:
new_train_df = train_df[["Text","Label"]]


# define preprocessing function
def preprocess_tweet(tweet):
    # remove usernames, URLs, and special characters
    tweet = re.sub("@[^\s]+", "", tweet)
    tweet = re.sub("(http|https)://[^\s]*", "", tweet)
    tweet = re.sub("[^a-zA-Z0-9\u0900-\u097F]+", " ", tweet)
    # remove extra spaces
    tweet = re.sub("\s+", " ", tweet).strip()
    return tweet

# apply preprocessing function to "tweet" column

# new_train_df["Text"] = new_train_df["Text"].map(lambda x:ind2label[x])

new_train_df["Text"] = new_train_df["Text"].apply(preprocess_tweet)

# new_train_df["Text"] = new_train_df["Text"].map(lambda x:' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|RT"," ",x).split()))
print(new_train_df.head())


                                                Text  Label
0                  मैं अपने लिए तो अपने आप बनाती हूँ  intro
1    और वो लोग अपने लिए अपने आप बनाते है उनकी फॅमिली  intro
3  नहीं वो है बड़ी वाली तो जो उनकी बेटी है और जो ...  intro
5  हांजी मेरी english starting वैसे राजोरी गार्डन...  intro
8  नहीं नहीं english Marriage से पहले में english...     dd


In [18]:
train_file_path = "/home/ashish/Documents/dd_classification/data/new_train_df.csv"
new_train_df.to_csv(train_file_path,sep="@")

Prepare Train and Validation set

In [19]:
train_csv_file = "/home/ashish/Documents/dd_classification/data/new_train_df.csv"


# load all data
all_texts,all_labels = load_partial_data(train_csv_file,n=-1)
# split data in Train and Validation
train_data,val_data = split_for_train(all_texts,all_labels)
x_train,y_train = train_data[0],train_data[1]
x_val,y_val = val_data[0],val_data[1]
# Further split Train data in Test as Original Kaggle Test data does not have labels
val_data,test_data = split_for_train(x_val,y_val,val_split=0.2)
x_val,y_val = val_data[0],val_data[1]
x_test,y_test = test_data[0],test_data[1]
print("Total %d x_train, %d y_train"%(len(x_train),len(y_train)))
print("Total %d x_val, %d y_val"%(len(x_val),len(y_val)))
print("Total %d x_test, %d y_test"%(len(x_test),len(y_test)))

Total 2393 x_train, 2393 y_train
Total 821 x_val, 821 y_val
Total 203 x_test, 203 y_test


Dump all the data in PKL file

In [20]:
with open(train_pkl,"wb") as f:
    pickle.dump((x_train,y_train),f)
with open(test_pkl,"wb") as f:
    pickle.dump((x_test,y_test),f)
with open(validation_pkl,"wb") as f:
    pickle.dump((x_val,y_val),f)

Load Original Kaggle Test data 

In [8]:
test_df = pd.read_csv(test_file,delimiter=" ::: ")
print(test_df.head())

  test_df = pd.read_csv(test_file,delimiter=" ::: ")


   1         Edgar's Lunch (1998)  \
0  2     La guerra de papá (1977)   
1  3  Off the Beaten Track (2010)   
2  4       Meu Amigo Hindu (2015)   
3  5            Er nu zhai (1955)   
4  6           Riddle Room (2016)   

  L.R. Brane loves his life - his car, his apartment, his job, but especially his girlfriend, Vespa. One day while showering, Vespa runs out of shampoo. L.R. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. When he returns, Vespa is gone and every trace of her existence has been wiped out. L.R.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding Vespa.  
0  Spain, March 1964: Quico is a very naughty chi...                                                                                                                                                                                                                   