## Importing the necessary libraries
- pandas
- sklearn
- string (standard library)
- matplotlib
- seaborn

In [438]:
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


## Using pandas library to retrieve the csv file

In [439]:
# Source CSV holding the English/French sentence pairs.
file = r"C:\Users\Bildad Otieno\Documents\Billy_Repo\Translation_Mod\Eng-Fre.csv"
# Read the file and strip every '�' replacement character in one chained step.
df = pd.read_csv(file, encoding='utf-8').replace('�', '', regex=True)
# Save the cleaned copy next to the source file, leaving out the index column.
df.to_csv(
    "C:\\Users\\Bildad Otieno\\Documents\\Billy_Repo\\Translation_Mod\\Eng-Fre2.csv",
    index=False,
)

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours!
2,Run!,Courez!
3,Who?,Qui ?
4,Wow!,a alors!
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","L'conomie en partant du haut vers le bas, a n..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous dcourage souv...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


## Checking for any null values
Checking for missing values: 
- **df.isnull()** or **df.isna()** - will return true if null
- **df.notnull()** - will return False if null

Handling missing values:
1)   Removing rows or columns with missing values: **df.dropna()**
2)   Interpolating missing values: **df.interpolate()**
3)   Imputing missing values: You can use **df.fillna(value)** to fill missing values with a specific value, or use more advanced techniques like mean, median, or machine learning algorithms for imputation.

In [440]:
# Number of missing entries in the French column (isnull is an alias of isna).
df["French words/sentences"].isnull().sum()

0

## Checking for unique values

In [441]:
# nunique() gives the distinct-value count per column; sum() adds those counts
# together, so this is a combined total across columns, not a count of unique rows.
df.nunique().sum()

289006

## Checking the number of rows
The **shape** attribute is a tuple of the form (rows, columns), so index 0 gives the number of rows.

In [442]:
# First element of the shape tuple is the row count.
df.shape[0]

175621

## Checking for number of records
We can also use **df.count()** to see the number of non-null records in every column.

In [443]:
# Count the non-null entries in each column.
df.count()

English words/sentences    175621
French words/sentences     175621
dtype: int64

## Checking for the data types of values within the dataframe
We could use **astype(dtype)** to change the data type of records e.g. df.astype(float)


In [444]:
# Show the dtype pandas assigned to each column.
df.dtypes

English words/sentences    object
French words/sentences     object
dtype: object

## Checking for number of duplicates
- Detecting duplicates: **df.duplicated()** to check for duplicate rows.
- Removing duplicates: **df.drop_duplicates()** to remove duplicate rows.

In [445]:
# Count rows whose English text repeats an earlier row; the French column is
# not part of the comparison.
df.duplicated(subset=["English words/sentences"]).sum()

52521

In [446]:
# Boolean mask of the whole frame: True wherever a cell is missing.
df.isna()

Unnamed: 0,English words/sentences,French words/sentences
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
175616,False,False
175617,False,False
175618,False,False
175619,False,False


In [447]:
# Pull each language out of the frame as its own Series.
Eng = df["English words/sentences"]
Fre = df["French words/sentences"]

In [449]:
# Display the punctuation characters provided by the standard-library string module.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Removing the Punctuation Marks

In [451]:
def remove_punc(column):
    """Strip punctuation from every string in ``column``.

    Parameters
    ----------
    column : iterable of str
        The sentences to clean, e.g. a pandas Series of text.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order. Every character
        listed in ``string.punctuation`` is removed; all other characters
        (letters, digits, whitespace, non-ASCII symbols) are kept.
    """
    # Build the deletion table once instead of testing membership per character.
    punc_table = str.maketrans('', '', string.punctuation)
    # Each item is cleaned independently, so an empty string yields '' rather
    # than raising NameError or reusing the previous item's result.
    return [sentence.translate(punc_table) for sentence in column]

In [452]:
# Preview the cleaned English sentences; the returned list is only displayed
# here and is not assigned to a variable.
remove_punc(Eng)

['Hi',
 'Run',
 'Run',
 'Who',
 'Wow',
 'Fire',
 'Help',
 'Jump',
 'Stop',
 'Stop',
 'Stop',
 'Wait',
 'Wait',
 'Go on',
 'Go on',
 'Go on',
 'Hello',
 'Hello',
 'I see',
 'I try',
 'I won',
 'I won',
 'I won',
 'Oh no',
 'Attack',
 'Attack',
 'Cheers',
 'Cheers',
 'Cheers',
 'Cheers',
 'Get up',
 'Go now',
 'Go now',
 'Go now',
 'Got it',
 'Got it',
 'Got it',
 'Got it',
 'Got it',
 'Hop in',
 'Hop in',
 'Hug me',
 'Hug me',
 'I fell',
 'I fell',
 'I know',
 'I left',
 'I left',
 'I lied',
 'I lost',
 'I paid',
 'Im 19',
 'Im OK',
 'Im OK',
 'Listen',
 'No way',
 'No way',
 'No way',
 'No way',
 'No way',
 'No way',
 'No way',
 'No way',
 'No way',
 'Really',
 'Really',
 'Really',
 'Thanks',
 'We try',
 'We won',
 'We won',
 'We won',
 'We won',
 'Ask Tom',
 'Awesome',
 'Be calm',
 'Be calm',
 'Be calm',
 'Be cool',
 'Be fair',
 'Be fair',
 'Be fair',
 'Be fair',
 'Be fair',
 'Be fair',
 'Be kind',
 'Be nice',
 'Be nice',
 'Be nice',
 'Be nice',
 'Be nice',
 'Be nice',
 'Beat it',
 'C

In [453]:
# Preview the cleaned French sentences; the returned list is only displayed
# here and is not assigned to a variable.
remove_punc(Fre)

['Salut',
 'Cours',
 'Courez',
 'Qui ',
 'a alors',
 'Au feu ',
 ' laide',
 'Saute',
 'a suffit',
 'Stop',
 'Arrtetoi ',
 'Attends ',
 'Attendez ',
 'Poursuis',
 'Continuez',
 'Poursuivez',
 'Bonjour ',
 'Salut ',
 'Je comprends',
 'Jessaye',
 'Jai gagn ',
 'Je lai emport ',
 'Jai gagn',
 'Oh non ',
 'Attaque ',
 'Attaquez ',
 'Sant ',
 ' votre sant ',
 'Merci ',
 'Tchintchin ',
 'Lvetoi',
 'Va maintenant',
 'Allezy maintenant',
 'Vasy maintenant',
 'Jai pig ',
 'Compris ',
 'Pig',
 'Compris',
 'Tas capt',
 'Monte',
 'Montez',
 'Serremoi dans tes bras ',
 'Serrezmoi dans vos bras ',
 'Je suis tombe',
 'Je suis tomb',
 'Je sais',
 'Je suis parti',
 'Je suis partie',
 'Jai menti',
 'Jai perdu',
 'Jai pay',
 'Jai 19 ans',
 'Je vais bien',
 'a va',
 'coutez ',
 'Cest pas possible',
 'Impossible',
 'En aucun cas',
 'Sans faons',
 'Cest hors de question ',
 'Il nen est pas question ',
 'Cest exclu ',
 'En aucune manire ',
 'Hors de question ',
 'Vraiment',
 'Vrai ',
 'Ah bon ',
 'Merci ',
 '

## Splitting Dataset into 70:30 Ratio

In [448]:
# Hold out 30% of the sentence pairs for testing so the split matches the
# 70:30 ratio named in the section heading (the previous test_size of .33 gave
# roughly 67:33). random_state pins the shuffle so the split is reproducible.
Eng_train, Eng_test, Fre_train, Fre_test = train_test_split(
    Eng, Fre, test_size=0.30, random_state=42
)