## Importing the necessary libraries
- pandas
- sklearn

In [136]:
import pandas as pd
import string
import nltk
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split


## Using pandas library to retrieve the csv file

In [84]:
file = r"C:\Users\Bildad Otieno\Documents\Billy_Repo\Translation_Mod\Eng-Fre.csv"
df = pd.read_csv(file, encoding= 'utf-8')
df = df.replace('�','',regex = True)
#df.to_csv("C:\\Users\\Bildad Otieno\\Documents\\Billy_Repo\\Translation_Mod\\Eng-Fre2.csv", index = False)

## Checking for any null values
Checking for missing values: 
- **df.isnull()** or **df.isna()** - will return true if null
- **df.notnull()** - will return true false if null

Handling missing values:
1)   Removing rows or columns with missing values: **df.dropna()**
2)   Interpolating missing values: **df.interpolate()**
3)   Imputing missing values: You can use **df.fillna(value)** to fill missing values with a specific value, or use more advanced techniques like mean, median, or machine learning algorithms for imputation.

In [85]:
df["French words/sentences"].isna().sum()

0

## Checking for unique values

In [86]:
df.nunique().sum()

289006

## Checking the number of rows
Shape function will return a tuple consisting of 2 indices, 1st (rows,columns)

In [87]:
df.shape[0]

175621

## Checking for number of records
We also could use this to see the number of records in every column.

In [88]:
df.count()

English words/sentences    175621
French words/sentences     175621
dtype: int64

## Checking for the data types of values within the dataframe
We could use **astype(dtype)** to change the data type of records e.g. df.astype(float)


In [89]:
df.dtypes

English words/sentences    object
French words/sentences     object
dtype: object

## Checking for number of duplicates
- Detecting duplicates: **df.duplicated()** to check for duplicate rows.
- Removing duplicates: **df.drop_duplicates()** to remove duplicate rows.

In [90]:
df["English words/sentences"].duplicated().sum()

52521

In [91]:
df.isnull()

Unnamed: 0,English words/sentences,French words/sentences
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
175616,False,False
175617,False,False
175618,False,False
175619,False,False


In [92]:
Eng, Fre = df["English words/sentences"], df["French words/sentences"]

In [93]:
#Printing out a collection of punctuation marks, ASCII characters
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


## Removing the Punctuation Marks

Initially I did this but then realized that I wasn't really using the fully capabilities of the <span style = "color:red">if statement</span>. You notice that I am instead using the else statement to append the letters to my **col** list.

    def remove_punc(column):
        new_column = []
        for word in column:
            col = [] 
            for letter in word:
                if letter in string.punctuation:
                    letter = letter.replace(letter,'')
                else:
                    col.append(letter) #list for individual letters now without punctuation mark
                new_word = "".join(col)
            new_column.append(new_word)    
        return new_column

Instead I used <span style = "color:blue">not in</span> which was more effective and cleaner.

In [94]:
def remove_punc(column):
    new_column = []
    for word in column:
        col = []
        for letter in word:
            if letter not in string.punctuation:
                col.append(letter) #list for individual letters now without punctuation mark
            new_word = "".join(col)
        new_column.append(new_word)    
    return new_column

In [95]:
No_Punc_Eng = remove_punc(Eng)

In [96]:
No_Punc_Fre = remove_punc(Fre)

In [97]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Bildad
[nltk_data]     Otieno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [98]:
tokenized_Eng = [nltk.word_tokenize(word) for word in No_Punc_Eng]
len(tokenized_Eng)

175621

In [99]:
tokenized_Fre = [nltk.word_tokenize(word) for word in No_Punc_Fre]
len(tokenized_Fre)

175621

In [100]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Bildad
[nltk_data]     Otieno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [101]:
#Verifyinf that we have English and French Stopwords
from nltk.corpus import stopwords
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [102]:
stop_Eng = stopwords.words('english') #179 of them

In [103]:
stop_Fre = stopwords.words('french') #157 of them

In [104]:
No_Stop_Eng = []
for word in tokenized_Eng:
    if word not in stop_Eng:
        No_Stop_Eng.append(word)

In [105]:
len(No_Stop_Eng)

175621

In [106]:
No_Stop_Fre = []
for word in tokenized_Fre:
    if word not in stop_Fre:
        No_Stop_Fre.append(word)

In [107]:
len(No_Stop_Fre)

175621

In [119]:
lower_Fre = []
for words in No_Stop_Fre:
    for word in words:
        lower_Fre.append(word.lower())

In [121]:
lower_Eng = []
for words in No_Stop_Eng:
    for word in words:
        lower_Eng.append(word.lower())

I will opt for lemmatization and not stemming as I did before:


    ps = PorterStemmer()
    print(" {0:25}  {1:25} ".format("--Word(s)--","--Stem--"))
    for word in lower_Eng:
        print("   {0:25}  {1:25} ".format(word,ps.stem(word)))


## Splitting Dataset into 70:30 Ratio

In [108]:
Eng_train, Eng_test, Fre_train, Fre_test = train_test_split(Eng, Fre, test_size= .33, random_state=42)

In [123]:
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")

# Initialize Python porter stemmer
ps = PorterStemmer()

# Example inflections to reduce
example_words = ["program","programming","programer","programs","programmed"]

# Perform stemming
print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in example_words:
   print ("{0:20}{1:20}".format(word, ps.stem(word)))

"""
--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program

"""

[nltk_data] Downloading package punkt to C:\Users\Bildad
[nltk_data]     Otieno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


'\n--Word--            --Stem--            \nprogram             program             \nprogramming         program             \nprogramer           program             \nprograms            program             \nprogrammed          program\n\n'

In [135]:
value1 = "Hello"
value2 = "World"
print("{3:13}{0:20}".format(value1, value2))


IndexError: Replacement index 3 out of range for positional args tuple