## Importing the necessary libraries
- pandas
- sklearn

In [1]:
import pandas as pd
import string
import nltk
from sklearn.model_selection import train_test_split


## Using pandas library to retrieve the csv file

In [2]:
file = r"C:\Users\Bildad Otieno\Documents\Billy_Repo\Translation_Mod\Eng-Fre.csv"
df = pd.read_csv(file, encoding= 'utf-8')
df = df.replace('�','',regex = True)
#df.to_csv("C:\\Users\\Bildad Otieno\\Documents\\Billy_Repo\\Translation_Mod\\Eng-Fre2.csv", index = False)

## Checking for any null values
Checking for missing values: 
- **df.isnull()** or **df.isna()** - will return true if null
- **df.notnull()** - will return true false if null

Handling missing values:
1)   Removing rows or columns with missing values: **df.dropna()**
2)   Interpolating missing values: **df.interpolate()**
3)   Imputing missing values: You can use **df.fillna(value)** to fill missing values with a specific value, or use more advanced techniques like mean, median, or machine learning algorithms for imputation.

In [3]:
df["French words/sentences"].isna().sum()

0

## Checking for unique values

In [4]:
df.nunique().sum()

289006

## Checking the number of rows
Shape function will return a tuple consisting of 2 indices, 1st (rows,columns)

In [5]:
df.shape[0]

175621

## Checking for number of records
We also could use this to see the number of records in every column.

In [6]:
df.count()

English words/sentences    175621
French words/sentences     175621
dtype: int64

## Checking for the data types of values within the dataframe
We could use **astype(dtype)** to change the data type of records e.g. df.astype(float)


In [7]:
df.dtypes

English words/sentences    object
French words/sentences     object
dtype: object

## Checking for number of duplicates
- Detecting duplicates: **df.duplicated()** to check for duplicate rows.
- Removing duplicates: **df.drop_duplicates()** to remove duplicate rows.

In [8]:
df["English words/sentences"].duplicated().sum()

52521

In [9]:
df.isnull()

Unnamed: 0,English words/sentences,French words/sentences
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
175616,False,False
175617,False,False
175618,False,False
175619,False,False


In [10]:
Eng, Fre = df["English words/sentences"], df["French words/sentences"]

In [11]:
#Printing out a collection of punctuation marks, ASCII characters
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


## Removing the Punctuation Marks

Initially I did this but then realized that I wasn't really using the fully capabilities of the <span style = "color:red">if statement</span>. You notice that I am instead using the else statement to append the letters to my **col** list.

    def remove_punc(column):
        new_column = []
        for word in column:
            col = [] 
            for letter in word:
                if letter in string.punctuation:
                    letter = letter.replace(letter,'')
                else:
                    col.append(letter) #list for individual letters now without punctuation mark
                new_word = "".join(col)
            new_column.append(new_word)    
        return new_column

Instead I used <span style = "color:blue">not in</span> which was more effective and cleaner.

In [12]:
def remove_punc(column):
    new_column = []
    for word in column:
        col = []
        for letter in word:
            if letter not in string.punctuation:
                col.append(letter) #list for individual letters now without punctuation mark
            new_word = "".join(col)
        new_column.append(new_word)    
    return new_column

In [13]:
No_Punc_Eng = remove_punc(Eng)

In [14]:
No_Punc_Fre = remove_punc(Fre)

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Bildad
[nltk_data]     Otieno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
tokenized_Eng = [nltk.word_tokenize(word) for word in No_Punc_Eng]
len(tokenized_Eng)

175621

In [25]:
tokenized_Fre = []
for word in No_Punc_Fre:
    print(word)

Salut
Cours
Courez
Qui 
a alors
Au feu 
 laide
Saute
a suffit
Stop
Arrtetoi 
Attends 
Attendez 
Poursuis
Continuez
Poursuivez
Bonjour 
Salut 
Je comprends
Jessaye
Jai gagn 
Je lai emport 
Jai gagn
Oh non 
Attaque 
Attaquez 
Sant 
 votre sant 
Merci 
Tchintchin 
Lvetoi
Va maintenant
Allezy maintenant
Vasy maintenant
Jai pig 
Compris 
Pig
Compris
Tas capt
Monte
Montez
Serremoi dans tes bras 
Serrezmoi dans vos bras 
Je suis tombe
Je suis tomb
Je sais
Je suis parti
Je suis partie
Jai menti
Jai perdu
Jai pay
Jai 19 ans
Je vais bien
a va
coutez 
Cest pas possible
Impossible
En aucun cas
Sans faons
Cest hors de question 
Il nen est pas question 
Cest exclu 
En aucune manire 
Hors de question 
Vraiment
Vrai 
Ah bon 
Merci 
On essaye
Nous avons gagn
Nous gagnmes
Nous lavons emport
Nous lemportmes
Demande  Tom
Fantastique
Sois calme 
Soyez calme 
Soyez calmes 
Sois dtendu 
Sois juste 
Soyez juste 
Soyez justes 
Sois quitable 
Soyez quitable 
Soyez quitables 
Sois gentil
Sois gentil 
Sois gentil

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Bildad
[nltk_data]     Otieno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
#Verifyinf that we have English and French Stopwords
from nltk.corpus import stopwords
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [20]:
stop_Eng = stopwords.words('english') #179 of them

In [21]:
stop_Fre = stopwords.words('french') #157 of them

In [22]:
No_Stop_Eng = []
for words in tokenized_Eng:
    for word in words:
        if word not in stop_Eng:
            No_Stop_Eng.append(word)

In [23]:
No_Stop_Eng

['Hi',
 'Run',
 'Run',
 'Who',
 'Wow',
 'Fire',
 'Help',
 'Jump',
 'Stop',
 'Stop',
 'Stop',
 'Wait',
 'Wait',
 'Go',
 'Go',
 'Go',
 'Hello',
 'Hello',
 'I',
 'see',
 'I',
 'try',
 'I',
 'I',
 'I',
 'Oh',
 'Attack',
 'Attack',
 'Cheers',
 'Cheers',
 'Cheers',
 'Cheers',
 'Get',
 'Go',
 'Go',
 'Go',
 'Got',
 'Got',
 'Got',
 'Got',
 'Got',
 'Hop',
 'Hop',
 'Hug',
 'Hug',
 'I',
 'fell',
 'I',
 'fell',
 'I',
 'know',
 'I',
 'left',
 'I',
 'left',
 'I',
 'lied',
 'I',
 'lost',
 'I',
 'paid',
 'Im',
 '19',
 'Im',
 'OK',
 'Im',
 'OK',
 'Listen',
 'No',
 'way',
 'No',
 'way',
 'No',
 'way',
 'No',
 'way',
 'No',
 'way',
 'No',
 'way',
 'No',
 'way',
 'No',
 'way',
 'No',
 'way',
 'Really',
 'Really',
 'Really',
 'Thanks',
 'We',
 'try',
 'We',
 'We',
 'We',
 'We',
 'Ask',
 'Tom',
 'Awesome',
 'Be',
 'calm',
 'Be',
 'calm',
 'Be',
 'calm',
 'Be',
 'cool',
 'Be',
 'fair',
 'Be',
 'fair',
 'Be',
 'fair',
 'Be',
 'fair',
 'Be',
 'fair',
 'Be',
 'fair',
 'Be',
 'kind',
 'Be',
 'nice',
 'Be',
 'nice

## Splitting Dataset into 70:30 Ratio

In [24]:
Eng_train, Eng_test, Fre_train, Fre_test = train_test_split(Eng, Fre, test_size= .33, random_state=42)