# <b> Preprocessing the LOTR Data

In [19]:
import os
import sys
sys.path.insert(0, '../')
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

## <b> Loading Data

In [20]:
# Load the raw characters and scripts tables from the raw-data folder.
# index_col=False asks pandas not to use the first CSV column as the row index.
data_char = pd.read_csv('../data/raw/lotr_characters.csv', index_col=False)
data_scripts = pd.read_csv('../data/raw/lotr_scripts.csv', index_col=False)

## <b> Information about Data

In [21]:
# Column names and dtypes of the characters table.
data_char.dtypes

birth     object
death     object
gender    object
hair      object
height    object
name      object
race      object
realm     object
spouse    object
dtype: object

In [22]:
# Column names and dtypes of the scripts table.
data_scripts.dtypes

Unnamed: 0     int64
char          object
dialog        object
movie         object
dtype: object

# <b> Null Values
    
    Removing all columns that have more than 40% missing values
    
    Removing all rows with missing values

### <b> Characters

In [23]:
def remove_null_columns(df, columns):
    """Return a new DataFrame without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    columns : label or list-like of labels
        Column labels to remove. A label that is not present raises KeyError.

    Returns
    -------
    pandas.DataFrame
        Independent frame holding every column of ``df`` except ``columns``.
    """
    # drop() without inplace already hands back a fresh frame.
    return df.drop(columns=columns)

def remove_null_rows(df):
    """Return a new DataFrame keeping only the rows without missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with every row containing at least one missing value removed.
        The surviving rows keep their original index labels.
    """
    return df.dropna()

# A column is discarded when its number of missing values exceeds 40% of the
# row count. Despite its name, null_counts holds the labels of those columns.
threshold_cols = np.round(0.4 * data_char.shape[0], 2)
null_counts = data_char.columns[data_char.isna().sum() > threshold_cols]

# Drop the sparse columns first, then every row that still has a missing value.
data_char = remove_null_rows(remove_null_columns(data_char, null_counts))

In [24]:
# Preview the characters table after the null handling above.
data_char.head()

Unnamed: 0,birth,death,gender,name,race
1,TA 2978,"February 26 ,3019",Male,Boromir,Men
3,TA 280,TA 515,Male,Tarcil,Men
5,SA 2709,SA 2962,Male,Ar-Adûnakhôr,Men
7,YT,FA 455,Male,Angrod,Elves
9,SA 3219,SA 3440,Male,Anárion,Men


### <b> Scripts

In [25]:
# 'Unnamed: 0' is removed explicitly by name (the helper drops whatever labels
# it is given); afterwards rows containing any missing value are removed.
data_scripts = remove_null_rows(remove_null_columns(data_scripts, ['Unnamed: 0']))

In [26]:
# Preview the scripts table after dropping 'Unnamed: 0' and incomplete rows.
data_scripts.head()

Unnamed: 0,char,dialog,movie
0,DEAGOL,"Oh Smeagol Ive got one! , Ive got a fish Smeag...",The Return of the King
1,SMEAGOL,"Pull it in! Go on, go on, go on, pull it in!",The Return of the King
2,DEAGOL,Arrghh!,The Return of the King
3,SMEAGOL,Deagol!,The Return of the King
4,SMEAGOL,Deagol!,The Return of the King


In [27]:
# Look at a slice of the raw dialog text (punctuation, mixed case) before cleaning.
data_scripts['dialog'][0:10]

0    Oh Smeagol Ive got one! , Ive got a fish Smeag...
1       Pull it in! Go on, go on, go on, pull it in!  
2                                             Arrghh! 
3                                            Deagol!  
4                                            Deagol!  
5                                            Deagol!  
6                       Give us that! Deagol my love  
7                                               Why?  
8        Because' , it's my birthday and I wants it.  
9                                       My precious.  
Name: dialog, dtype: object

# <b> Text Cleaning

In [31]:
class TextCleasing:
    """Lower-case text columns of a DataFrame and strip punctuation symbols.

    An instance wraps one DataFrame. Calling the instance with a list of
    column names returns a cleaned copy and leaves the wrapped frame untouched.
    """

    # Characters removed by clean_text: punctuation, quotes and a few symbols.
    symbols_ = re.compile(r'[+@*#$%^&*”=/,().:°"\'‘’;-]')

    def __init__(self, dataframe):
        """Store a reference (not a copy) to the frame that __call__ cleans."""
        self.dataframe = dataframe

    def clean_text(self, text):
        """Return ``text`` lower-cased with every ``symbols_`` match removed.

        Values that are not strings are converted with ``str`` first.
        """
        lowered = str(text).lower()
        return self.symbols_.sub('', lowered)

    def process_stem(self, df, col):
        """Lemmatise the values of ``df[col]`` and return a copy of ``df``.

        Every cell value is passed as a whole to ``lemma.lemmatize`` (the
        module-level lemmatiser). Note that ``df`` itself is updated in place
        before the copy is taken, so the caller's frame changes as well.
        """
        df[col] = list(map(lemma.lemmatize, df[col]))
        return df.copy()

    def __call__(self, columns):
        """Return a copy of the wrapped frame with ``columns`` cleaned.

        Each listed column is replaced by its values run through clean_text;
        all other columns are copied unchanged.
        """
        cleaned = self.dataframe.copy()
        for name in columns:
            cleaned[name] = list(map(self.clean_text, self.dataframe[name]))
        return cleaned

### <b> Scripts

In [32]:
# Lower-case and strip symbols from the three text columns of the scripts
# table; the cleaned copy replaces data_scripts and a preview is displayed.
clean = TextCleasing(data_scripts)
data_scripts = clean(['dialog', 'movie', 'char'])
data_scripts.head()

Unnamed: 0,char,dialog,movie
0,deagol,oh smeagol ive got one! ive got a fish smeago...,the return of the king
1,smeagol,pull it in! go on go on go on pull it in!,the return of the king
2,deagol,arrghh!,the return of the king
3,smeagol,deagol!,the return of the king
4,smeagol,deagol!,the return of the king


### <b> Characters

In [33]:
# Statement order matters: `clean` keeps a reference to the current data_char,
# and process_stem writes the lemmatised 'gender' values into the frame it is
# given before returning a copy. The following clean(...) call reads from the
# frame stored on `clean`, so it only sees those values because of that
# in-place write.
# NOTE(review): lemmatisation runs on the raw, not yet lower-cased values and on
# whole cell values; the later manual removal of 'males' suggests plurals are
# not normalised here — confirm whether it should run after cleaning instead.
clean = TextCleasing(data_char)
data_char = clean.process_stem(data_char, 'gender')
data_char = clean(['name', 'race', 'gender', 'birth', 'death'])
data_char.head()

Unnamed: 0,birth,death,gender,name,race
1,ta 2978,february 26 3019,male,boromir,men
3,ta 280,ta 515,male,tarcil,men
5,sa 2709,sa 2962,male,aradûnakhôr,men
7,yt,fa 455,male,angrod,elves
9,sa 3219,sa 3440,male,anárion,men


In [49]:
# Distribution of the gender labels.
# NOTE(review): this cell carries execution count 49 while the filtering cell
# that follows it on the page carries 48, so the saved output apparently
# reflects the already-filtered data; on a fresh top-to-bottom run the counts
# shown here may include additional labels.
data_char['gender'].value_counts()

male      471
female     76
Name: gender, dtype: int64

In [48]:
# Discard the rows whose gender is one of the two irregular labels, keeping
# every other row as it is.
data_char = data_char[~data_char['gender'].isin(['most likely male', 'males'])]

# <b> Saving Processed Data

In [50]:
# Persist the cleaned tables without the row index. The target folder is
# created first so the export also works when '../data/processed' does not
# exist yet (to_csv does not create missing directories).
os.makedirs('../data/processed', exist_ok=True)
data_char.to_csv('../data/processed/lotr_characters.csv', index=False)
data_scripts.to_csv('../data/processed/lotr_scripts.csv', index=False)