# Processing for Logistic Regression

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [6]:
# Load the full comment dataset from the CSV sitting next to the notebook.
df = pd.read_csv('all_data.csv')

# Last expression of the cell: display the column index to see which fields are available.
df.columns

Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count'],
      dtype='object')

## Exploratory look at the most severely toxic comments (not used by later cells)

In [7]:
# Keep only the identifying columns and the two toxicity scores, ordered so
# that the most severely toxic comments come first.
newdf = (
    df[[
        'id', 'comment_text', 'split', 'publication_id',
        'parent_id', 'article_id', 'toxicity', 'severe_toxicity',
    ]]
    .sort_values(by='severe_toxicity', ascending=False)
)

# Last expression of the cell: render the sorted frame.
newdf

Unnamed: 0,id,comment_text,split,publication_id,parent_id,article_id,toxicity,severe_toxicity
596097,5801652,Awesome! Lets cut the head off hate! Lets stab...,train,21,,368010,1.000000,1.000000
207189,5997855,You are murdering feckless SCUM. Your mother ...,train,102,,379957,0.983501,0.644363
601973,5883664,.\n.\nIs there really a God ?\n... I once thou...,train,53,5879007.0,373036,0.900000,0.600000
461912,6180251,DOGS----\nEAT\nSHIT\nSLEEP\nBARK\n\nNIGERS\nEA...,train,21,6179746.0,390746,0.990396,0.591236
528865,5754942,BURN THIS DOG RAPING WHITE NIGER ALIVE\nTIME f...,train,105,,365110,0.973936,0.569070
...,...,...,...,...,...,...,...,...
709894,5163549,"Burnsie: I wish you all the best, but despite ...",train,54,5163479.0,328802,0.000000,0.000000
709893,5687703,Yup.,train,43,5687001.0,360936,0.000000,0.000000
709892,5121964,ANOTHER moped rider speeding AND not wearing a...,train,55,,326223,0.000000,0.000000
709891,5317323,"Scheer was born in Ottawa, Ontario, 1979\n-stu...",train,54,,338336,0.000000,0.000000


## Data Cleaning

In [8]:
import re
def clean_text(text):
    """Normalise a raw comment before tokenisation.

    URLs are stripped, every character that is neither a word character nor
    whitespace is removed, and the result is lower-cased.

    Parameters
    ----------
    text : object
        The raw comment. Anything that is not a ``str`` (for example a float
        ``NaN`` standing in for a missing comment) is treated as empty.

    Returns
    -------
    str
        The cleaned comment, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Match the scheme case-insensitively: lower-casing only happens at the
    # very end, so "HTTP://..." would otherwise survive this step and be
    # mangled into a junk token by the punctuation removal below.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)

    # Drop punctuation. Underscores count as word characters and are kept.
    text = re.sub(r'[^\w\s]', '', text)

    # TODO: handle self-censored words and emojis as needed.
    return text.lower()

# cleaned = df['comment_text'].apply(clean_text)
# cleaned


## Downsample non-toxic rows to balance the dataset

In [9]:
# Reload the raw dataset and attach a cleaned copy of every comment.
data = pd.read_csv('all_data.csv')
data['cleaned_text'] = data['comment_text'].apply(clean_text)

# Score columns that can mark a comment as toxic.
toxic_columns = ['toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit', 'identity_attack', 'insult', 'threat']

# NOTE: min-max scaling of these columns was tried at one point and left disabled.

# Tunable settings for labelling and balancing.
TOXICITY_THRESHOLD = 0.2  # a comment is toxic when ANY score column reaches this value
MAX_CLASS_RATIO = 2       # downsample when non-toxic : toxic exceeds this ratio
RANDOM_STATE = 42         # fixed seed so sampling and shuffling are repeatable

# Binary label: 1 when at least one score column is >= the threshold, else 0.
# This is a vectorised comparison instead of a row-wise `apply`, which is very
# slow on a frame of this size. Missing scores compare as False either way.
data['is_toxic'] = (data[toxic_columns] >= TOXICITY_THRESHOLD).any(axis=1).astype('int64')

# Check the class distribution before balancing
print(data['is_toxic'].value_counts())

# Ratio of non-toxic to toxic samples
toxic_count = data['is_toxic'].sum()
non_toxic_count = len(data) - toxic_count
# Without any toxic rows the ratio is undefined; treat it as infinite rather
# than dividing by zero.
ratio = non_toxic_count / toxic_count if toxic_count else float('inf')

# Downsample the non-toxic class to a 1:1 balance when it dominates.
# Skipped when there are no toxic rows at all, which would otherwise leave an
# empty dataset.
if toxic_count and ratio > MAX_CLASS_RATIO:
    non_toxic_data = data[data['is_toxic'] == 0]
    toxic_data = data[data['is_toxic'] == 1]

    # Keep as many randomly chosen non-toxic samples as there are toxic ones
    num_samples_to_keep = int(toxic_count)
    non_toxic_data = non_toxic_data.sample(n=num_samples_to_keep, random_state=RANDOM_STATE)

    # Combine both classes, then shuffle so they are interleaved
    data = pd.concat([toxic_data, non_toxic_data])
    data = data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)


is_toxic
0    1570549
1     428967
Name: count, dtype: int64


## Save the processed data as a new csv

In [11]:
# Write the frame held in `data` to disk; index=False keeps the row index out of the file.
data.to_csv('cleaned_data.csv', index=False)

# Tokenization

In [10]:
from gensim.models.doc2vec import Doc2Vec,\
    TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt' tokenizer data (a no-op when it is already up to date).
# NOTE(review): presumably this is the resource word_tokenize relies on; confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CasKei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Tokenise every cleaned comment and wrap it in a TaggedDocument whose single
# tag is the comment's position in `data`, rendered as a string.
tagged_data = [
    TaggedDocument(words=word_tokenize(comment), tags=[str(position)])
    for position, comment in enumerate(data['cleaned_text'])
]


## Save / load the tokenized data as a pickle

In [1]:
import pickle
from pathlib import Path

# Tokenising the whole corpus is slow, so the tagged documents are cached on disk:
#   * cache present -> load it (this also works on a fresh kernel, skipping tokenisation)
#   * cache missing -> write the in-memory `tagged_data` so later runs can reuse it
# If the cleaning or balancing steps change, delete the pickle so it is rebuilt;
# a stale cache will no longer line up with `data`.
tagged_data_path = Path('tagged_data.pickle')

if tagged_data_path.exists():
    # SECURITY: pickle.load can execute arbitrary code. Only load a file that
    # this notebook produced itself, never one from an untrusted source.
    with tagged_data_path.open('rb') as handle:
        tagged_data = pickle.load(handle)
else:
    with tagged_data_path.open('wb') as handle:
        pickle.dump(tagged_data, handle, protocol=pickle.HIGHEST_PROTOCOL)


## Doc2Vec training

In [12]:
# Set up a Doc2Vec model: 20-dimensional vectors, words seen fewer than
# 2 times are ignored, 10 training epochs.
model = Doc2Vec(vector_size=20, min_count=2, epochs=10)

# Build the vocabulary from the tagged documents, then train on the same corpus.
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Infer one vector per cleaned comment, in the row order of `data`.
# Each comment is tokenised again here rather than reusing `tagged_data`.
document_vectors = [
    model.infer_vector(tokens)
    for tokens in map(word_tokenize, data['cleaned_text'])
]
 
# #  print the document vectors
# for i, doc in enumerate(cleaned):
#     print("Document", i+1, ":", doc)
#     print("Vector:", document_vectors[i])
#     print()

## Save trained document vectors

In [13]:
# Serialise the list of document vectors with the highest pickle protocol available.
with open('LR_document_vectors.pickle', 'wb') as vectors_file:
    pickle.dump(document_vectors, vectors_file, pickle.HIGHEST_PROTOCOL)

In [14]:
# Sanity check: the two counts printed below should match (one vector per row of `data`).
print(len(document_vectors), len(data), sep='\n')

857934
857934
