## Importing the necessary libraries
- pandas
- string
- nltk
- spacy
- sklearn

In [2]:
import pandas as pd
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import spacy
from sklearn.model_selection import train_test_split


## Using pandas library to retrieve the csv file

In [3]:
# Absolute location of the English-French sentence-pair CSV on the author's machine.
file = r"C:\Users\Bildad Otieno\Documents\Billy_Repo\Translation_Mod\Eng-Fre.csv"

# Read the CSV as UTF-8 and delete every '�' character from the cells.
# regex=True is what makes this a substring replacement inside each value;
# without it only cells that are exactly '�' would be replaced.
df = pd.read_csv(file, encoding="utf-8").replace("�", "", regex=True)
#df.to_csv("C:\\Users\\Bildad Otieno\\Documents\\Billy_Repo\\Translation_Mod\\Eng-Fre2.csv", index = False)

## Checking for any null values
Checking for missing values: 
- **df.isnull()** or **df.isna()** - will return true if null
- **df.notnull()** - will return False if null

Handling missing values:
1)   Removing rows or columns with missing values: **df.dropna()**
2)   Interpolating missing values: **df.interpolate()**
3)   Imputing missing values: You can use **df.fillna(value)** to fill missing values with a specific value, or use more advanced techniques like mean, median, or machine learning algorithms for imputation.

In [4]:
df["French words/sentences"].isna().sum()

0

## Checking for unique values

In [5]:
df.nunique().sum()

289032

## Checking the number of rows
**df.shape** is an attribute that returns a tuple of the form (rows, columns), so index 0 gives the number of rows.

In [6]:
df.shape[0]

175621

## Checking for number of records
We could also use **df.count()** to see the number of non-null records in every column.

In [7]:
df.count()

English words/sentences    175621
French words/sentences     175621
dtype: int64

## Checking for the data types of values within the dataframe
We could use **astype(dtype)** to change the data type of records e.g. df.astype(float)


In [8]:
df.dtypes

English words/sentences    object
French words/sentences     object
dtype: object

## Checking for number of duplicates
- Detecting duplicates: **df.duplicated()** to check for duplicate rows.
- Removing duplicates: **df.drop_duplicates()** to remove duplicate rows.

In [9]:
df["English words/sentences"].duplicated().sum()

52521

In [10]:
df.isnull()

Unnamed: 0,English words/sentences,French words/sentences
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
175616,False,False
175617,False,False
175618,False,False
175619,False,False


In [11]:
# Pull the two language columns out of the frame as separate Series.
Eng = df["English words/sentences"]
Fre = df["French words/sentences"]

In [12]:
#Printing out a collection of punctuation marks, ASCII characters
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


## Removing the Punctuation Marks

Initially I did this, but then realized that I wasn't really using the full capabilities of the <span style = "color:red">if statement</span>. Notice that I am instead using the else statement to append the letters to my **col** list.

    def remove_punc(column):
        new_column = []
        for word in column:
            col = [] 
            for letter in word:
                if letter in string.punctuation:
                    letter = letter.replace(letter,'')
                else:
                    col.append(letter) #list for individual letters now without punctuation mark
                new_word = "".join(col)
            new_column.append(new_word)    
        return new_column

Instead I used <span style = "color:blue">not in</span> which was more effective and cleaner.

In [13]:
def remove_punc(column):
    """Strip ASCII punctuation from every string in ``column``.

    Args:
        column: An iterable of strings (for example a pandas Series of
            sentences).

    Returns:
        A list of strings, one per input element and in the same order, with
        every character found in ``string.punctuation`` removed. An empty
        input string yields an empty output string.
    """
    # A set gives constant-time membership tests for single characters.
    punctuation = set(string.punctuation)
    # Each cleaned string is built independently of the others, so an empty
    # input can never pick up the result of the previous row, and the join
    # happens once per string instead of once per character.
    return [
        "".join(letter for letter in word if letter not in punctuation)
        for word in column
    ]

In [14]:
No_Punc_Eng = remove_punc(Eng)

In [15]:
No_Punc_Fre = remove_punc(Fre)

In [16]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Bildad
[nltk_data]     Otieno/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
# Turn each punctuation-free English sentence into a list of word tokens.
tokenized_Eng = list(map(nltk.word_tokenize, No_Punc_Eng))
len(tokenized_Eng)

175621

In [18]:
# Turn each punctuation-free French sentence into a list of word tokens.
# word_tokenize defaults to the English Punkt model, so the language is
# passed explicitly to use the French one for this column.
tokenized_Fre = [
    nltk.word_tokenize(sentence, language="french") for sentence in No_Punc_Fre
]
len(tokenized_Fre)

175621

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Bildad
[nltk_data]     Otieno/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [20]:
#Verifying that we have English and French stopwords
from nltk.corpus import stopwords
#stopwords.fileids()

In [21]:
stop_Eng = stopwords.words('english') #179 of them

In [22]:
stop_Fre = stopwords.words('french') #157 of them

In [23]:
# Remove English stop words from every tokenized sentence.
# Each element of tokenized_Eng is a list of tokens, so the filter has to be
# applied per token; comparing the whole list against the stop-word strings is
# always "not in" and removes nothing.
# Tokens are lowercased for the comparison only, so the kept tokens retain
# their original casing.
stop_Eng_lookup = set(stop_Eng)
No_Stop_Eng = [
    [token for token in sentence if token.lower() not in stop_Eng_lookup]
    for sentence in tokenized_Eng
]

In [24]:
len(No_Stop_Eng)

175621

In [25]:
# Remove French stop words from every tokenized sentence.
# Each element of tokenized_Fre is a list of tokens, so the filter has to be
# applied per token; comparing the whole list against the stop-word strings is
# always "not in" and removes nothing.
# Tokens are lowercased for the comparison only, so the kept tokens retain
# their original casing.
stop_Fre_lookup = set(stop_Fre)
No_Stop_Fre = [
    [token for token in sentence if token.lower() not in stop_Fre_lookup]
    for sentence in tokenized_Fre
]

In [26]:
len(No_Stop_Fre)

175621

In [27]:
# Flatten the per-sentence French token lists into one list of lowercased tokens.
# NOTE(review): flattening discards sentence boundaries, so this list no longer
# lines up row-by-row with the English data — confirm that is intended.
lower_Fre = [token.lower() for sentence in No_Stop_Fre for token in sentence]

In [28]:
# Flatten the per-sentence English token lists into one list of lowercased tokens.
# NOTE(review): flattening discards sentence boundaries, so this list no longer
# lines up row-by-row with the French data — confirm that is intended.
lower_Eng = [token.lower() for sentence in No_Stop_Eng for token in sentence]

I will opt for lemmatization and not stemming as I did before:


    ps = PorterStemmer()
    print(" {0:25}  {1:25} ".format("--Word(s)--","--Stem--"))
    for word in lower_Eng:
        print("   {0:25}  {1:25} ".format(word,ps.stem(word)))


In [29]:
# nltk.download('all') - Every Package is Up-to-date for my Ellie

In [30]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to C:\Users\Bildad
[nltk_data]     Otieno/nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [31]:
wnl = WordNetLemmatizer()

In [38]:
# Make sure the WordNet corpus needed by `wnl` is available; this repeats the
# download issued in an earlier cell.
nltk.download('wordnet')
# The triple-quoted string below is an unused expression, not executed code:
# it holds a disabled preview loop that would print each token of lower_Fre
# next to the lemma returned by wnl.
'''print(" {0:25}  {1:25} ".format("--Word(s)--","--Lemma--"))
for word in lower_Fre:
    print("   {0:25}  {1:25} ".format(word, wnl.lemmatize(word, pos='v')))'''
    
# Lemmatize every lowercased English token, passing pos='v' for all of them.
# NOTE(review): pos='v' presumably makes the lemmatizer treat each token as a
# verb — confirm this is intended for tokens that are not verbs.
lemm_Eng = [wnl.lemmatize(word, pos='v') for word in lower_Eng]

[nltk_data] Downloading package wordnet to C:\Users\Bildad
[nltk_data]     Otieno/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
#nlp = spacy.load()
#for word in lower_Fre:

In [44]:
import spacy

# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Sample sentence kept for ad-hoc experiments; it is not used by the batch below.
text = "This is an example sentence."

# Process all English tokens in one batched stream. nlp.pipe yields one Doc per
# input text, in input order, and avoids invoking the full pipeline separately
# for each item as a per-element nlp(...) call does.
docs = list(nlp.pipe(lower_Eng))

# Sentence-level information is available per document, for example:
#     for sent in docs[0].sents:
#         print(sent.text)


In [3]:
!python -m spacy download fr_core_news_md

Collecting fr-core-news-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.5.0/fr_core_news_md-3.5.0-py3-none-any.whl (45.8 MB)
     -------------------------------------- 45.8/45.8 MB 309.6 kB/s eta 0:00:00
Installing collected packages: fr-core-news-md
Successfully installed fr-core-news-md-3.5.0
✔ Download and installation successful
You can now load the package via spacy.load('fr_core_news_md')


In [4]:
import spacy

# Load the medium French pipeline. This rebinds `nlp`, so any pipeline
# previously stored under that name is replaced from here on.
nlp = spacy.load('fr_core_news_md')

# Run a handful of sample French words through the pipeline and show each
# token next to its lemma, one pair per line.
doc = nlp("voudrais non animaux yeux dors couvre.")
for token in doc:
    print(f"{token} {token.lemma_}")

voudrais vouloir
non non
animaux animal
yeux yeux
dors dormir
couvre couvrir
. .


## Splitting Dataset into 70:30 Ratio

In [34]:
# Hold out 30% of the sentence pairs for testing so the split matches the
# 70:30 ratio stated in this section's heading (the previous value of .33
# produced a 67:33 split). random_state fixes the shuffle so the split is
# reproducible.
Eng_train, Eng_test, Fre_train, Fre_test = train_test_split(
    Eng, Fre, test_size=0.30, random_state=42
)