In [1]:
import pandas as pd 
from nltk import ngrams
import numpy as np

## Read the data from the files :

### Pre-Processed And Annotated YouTube Data :

In [2]:
data_1 = pd.read_csv('../Data/data_1/PreProcessedYouTubeDataFileAndAnnotated.csv')
data_2 = pd.read_csv('../Data/data_2/PreProcessedYouTubeDataFileAndAnnotated.csv')

In [3]:
data_1 = data_1.iloc[:,1:-1]
data_2 = data_2.iloc[:,1:-1]

### Dictionaries of bad words :
The new bad words have been extracted from the offensive comments that have lenght >= 4.

In [4]:
dictionary = pd.read_csv('../Data/dictionaries/dictionaryWithFrequency.csv')
dictionary_1 = pd.read_csv('../Data/dictionaries/dictionary_with_new_bad_words_1.csv')
dictionary_2 = pd.read_csv('../Data/dictionaries/dictionary_with_new_bad_words_2.csv')

### Annotate :
It is a function used to automaticly annotate a comment as offensive if it contains a bad word present in the dictionary 

In [5]:
def annotate(comment, dictionary):
    dictionary = dict(zip(dictionary["bad_word"], dictionary["frequency"]))
    annotation = "n"
    for n in [1,2,3,4]:
        commentNgrams = ngrams(comment.split(), n)
        for grams in commentNgrams:
            key = ' '.join(grams)
            if key in dictionary : 
                annotation = "p"
    return annotation

In [6]:

data_1['auto_annotation'] = data_1["text"].apply(lambda x: annotate(x,dictionary)) 
data_1['auto_annotation_dict_1'] = data_1["text"].apply(lambda x: annotate(x,dictionary_1)) 
data_1['auto_annotation_dict_2'] = data_1["text"].apply(lambda x: annotate(x,dictionary_2)) 

data_2['auto_annotation'] = data_2["text"].apply(lambda x: annotate(x,dictionary)) 
data_2['auto_annotation_dict_1'] = data_2["text"].apply(lambda x: annotate(x,dictionary_1)) 
data_2['auto_annotation_dict_2'] = data_2["text"].apply(lambda x: annotate(x,dictionary_2)) 


result, result_dict_1, result_dict_2 will be 1 if the comment is annotated with the correct value using the dictionary, dictionary_1, dictionary_2 respectively. 

In [7]:
data_1['result'] = np.where(data_1["offensive/non offensive"] == data_1["auto_annotation"], 1, 0) 
data_1['result_dict_1'] = np.where(data_1["offensive/non offensive"] == data_1["auto_annotation_dict_1"], 1, 0) 
data_1['result_dict_2'] = np.where(data_1["offensive/non offensive"] == data_1["auto_annotation_dict_2"], 1, 0)

data_2['result'] = np.where(data_2["offensive/non offensive"] == data_2["auto_annotation"], 1, 0) 
data_2['result_dict_1'] = np.where(data_2["offensive/non offensive"] == data_2["auto_annotation_dict_1"], 1, 0) 
data_2['result_dict_2'] = np.where(data_2["offensive/non offensive"] == data_2["auto_annotation_dict_2"], 1, 0)

In [9]:
def accuracy(data):
    
    #Calculate the accuracy using the three dictionaries : 
    true_annotation = np.sum(data["result"])
    accuracy = true_annotation/ len(data)
    
    true_annotation = np.sum(data["result_dict_1"])
    accuracy_dict_1 = true_annotation/ len(data)
    
    true_annotation = np.sum(data["result_dict_2"])
    accuracy_dict_2 = true_annotation/ len(data)
    
    
    return accuracy, accuracy_dict_1, accuracy_dict_2

In [10]:
d1_accuracy, d1_accuracy_dict_1, d1_accuracy_dict_2 = accuracy(data_1)
d2_accuracy, d2_accuracy_dict_1, d2_accuracy_dict_2 = accuracy(data_2)

In [11]:
print("Data_1 :")
print(d1_accuracy)
print(d1_accuracy_dict_1)
print(d1_accuracy_dict_2)

Data_1 :
0.5005843396961434
0.7043241137514609
0.7070510323334632


In [12]:
print("Data_2 :")
print(d2_accuracy)
print(d2_accuracy_dict_1)
print(d2_accuracy_dict_2)

Data_2 :
0.4826961107448912
0.5964073829927489
0.6476598549769281


# Using Annotation :
Using Annotation Function to get offensive Comments from the non annotated data

In [13]:
data_1_non_annotated = pd.read_csv('../Data/data_1/PreProcessedYouTubeDataFileAndNonAnnotated.csv')
data_2_non_annotated = pd.read_csv('../Data/data_2/PreProcessedYouTubeDataFileAndNonAnnotated.csv')

In [14]:
data_1_non_annotated['auto_annotation'] = data_1_non_annotated["text"].apply(lambda x: annotate(x,dictionary)) 
data_1_non_annotated['auto_annotation_dict_1'] = data_1_non_annotated["text"].apply(lambda x: annotate(x,dictionary_1)) 
data_1_non_annotated['auto_annotation_dict_2'] = data_1_non_annotated["text"].apply(lambda x: annotate(x,dictionary_2)) 
data_1_non_annotated = data_1_non_annotated.iloc[:,1:]
 

In [15]:
data_2_non_annotated['auto_annotation'] = data_2_non_annotated["text"].apply(lambda x: annotate(x,dictionary)) 
data_2_non_annotated['auto_annotation_dict_1'] = data_2_non_annotated["text"].apply(lambda x: annotate(x,dictionary_1)) 
data_2_non_annotated['auto_annotation_dict_2'] = data_2_non_annotated["text"].apply(lambda x: annotate(x,dictionary_2)) 
data_2_non_annotated = data_2_non_annotated.iloc[:,1:]

In [16]:
def get_auto_annotated_positive(data_non_annotated):
    data_auto_annotated = pd.DataFrame(columns=['author', 'authorChannelUrl', 'text','likeCount', 'publishedAt', 'offensive/non offensive'])
    data_auto_annotated_dict_1 = pd.DataFrame(columns=['author', 'authorChannelUrl', 'text','likeCount', 'publishedAt', 'offensive/non offensive'])
    data_auto_annotated_dict_2 = pd.DataFrame(columns=['author', 'authorChannelUrl', 'text','likeCount', 'publishedAt', 'offensive/non offensive'])

    for index in range(len(data_non_annotated)) :
        comment = data_non_annotated.iloc[index]
        if comment["auto_annotation"] == "p":
            comment["offensive/non offensive"] = "p"
            data_auto_annotated = data_auto_annotated.append({'author':comment["author"] , 'authorChannelUrl':comment["authorChannelUrl"] , 'text':comment["text"] ,'likeCount':comment["likeCount"] , 'publishedAt':comment["publishedAt"] , 'offensive/non offensive': comment["offensive/non offensive"]}, ignore_index=True)

        if comment["auto_annotation_dict_1"] == "p":
            comment["offensive/non offensive"] = "p"
            data_auto_annotated_dict_1 = data_auto_annotated_dict_1.append({'author':comment["author"] , 'authorChannelUrl':comment["authorChannelUrl"] , 'text':comment["text"] ,'likeCount':comment["likeCount"] , 'publishedAt':comment["publishedAt"] , 'offensive/non offensive': comment["offensive/non offensive"]}, ignore_index=True)

        if comment["auto_annotation_dict_2"] == "p":
            comment["offensive/non offensive"] = "p"
            data_auto_annotated_dict_2 = data_auto_annotated_dict_2.append({'author':comment["author"] , 'authorChannelUrl':comment["authorChannelUrl"] , 'text':comment["text"] ,'likeCount':comment["likeCount"] , 'publishedAt':comment["publishedAt"] , 'offensive/non offensive': comment["offensive/non offensive"]}, ignore_index=True)
    return data_auto_annotated, data_auto_annotated_dict_1, data_auto_annotated_dict_2


In [17]:
data_1_auto_annotated, data_1_auto_annotated_dict_1, data_1_auto_annotated_dict_2 = get_auto_annotated_positive(data_1_non_annotated)
data_2_auto_annotated, data_2_auto_annotated_dict_1, data_2_auto_annotated_dict_2 = get_auto_annotated_positive(data_2_non_annotated)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [18]:
data_1_auto_annotated.to_csv('../Data/data_1/auto_annotated.csv')
data_1_auto_annotated_dict_1.to_csv('../Data/data_1/auto_annotated_dict_1.csv')
data_1_auto_annotated_dict_2.to_csv('../Data/data_1/auto_annotated_dict_2.csv')



data_2_auto_annotated.to_csv('../Data/data_2/auto_annotated.csv')
data_2_auto_annotated_dict_1.to_csv('../Data/data_2/auto_annotated_dict_1.csv')
data_2_auto_annotated_dict_2.to_csv('../Data/data_2/auto_annotated_dict_2.csv')