## GI5  <center> Normal Language Processing Classic Technics </center> Ammari Youssef <font float=right>  </font>

## <font color=red> Installation de spacy et nltk et Chargement de données du dataset 

In [1]:
import pandas as pd
import emoji
import re
import nltk
from nltk.corpus import stopwords

# nltk.download("stopwords") run this if you don't have stopwords 

# Load the dataset
df = pd.read_csv('sample.csv')
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


## <font color=red> Functions that process texts 

In [2]:
#remove emojis 

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

#Cleanse the texts 
def preprocess_text(text):
    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra spaces
    text = ' '.join(text.split())

    # Remove common stop words
    stop_words = set(stopwords.words("english"))
    words = text.split()
    words = [word for word in words if word.lower() not in stop_words]

    text = ' '.join(words)

    return text

In [3]:
df['cleaned_text'] = df['text'].apply(deEmojify).apply(preprocess_text)
df['cleaned_text'][0]

'AppleSupport causing reply disregarded tapped notification keyboard opened'

## <font color=red> Functions of classic NLP Technics with Spacy & NLTK

In [4]:
#NLP technics 
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

def lemmatize_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

def stem_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [5]:
import nltk
from nltk.stem import PorterStemmer

#stemming with nltk
stemmer = PorterStemmer()


In [6]:
# Apply tokenization
df["tokenized_text"] = df["cleaned_text"].apply(tokenize_text)

# Apply lemmatization
df["lemmatized_text"] = df["cleaned_text"].apply(lemmatize_text)

# Apply stemming (for illustration, spaCy doesn't provide a traditional stemmer)
df["stemmed_text"] = df["cleaned_text"].apply(stem_text)

df["nltk_stemmed_text"] = df["cleaned_text"].apply(lambda text: [stemmer.stem(word) for word in text.split()])

df[["text","cleaned_text","tokenized_text","lemmatized_text","stemmed_text","nltk_stemmed_text"]].head()

Unnamed: 0,text,cleaned_text,tokenized_text,lemmatized_text,stemmed_text,nltk_stemmed_text
0,@AppleSupport causing the reply to be disregar...,AppleSupport causing reply disregarded tapped ...,"[AppleSupport, causing, reply, disregarded, ta...","[AppleSupport, cause, reply, disregard, tap, n...","[AppleSupport, causing, reply, disregarded, ta...","[applesupport, caus, repli, disregard, tap, no..."
1,@105835 Your business means a lot to us. Pleas...,business means lot us Please DM name zip code ...,"[business, means, lot, us, Please, DM, name, z...","[business, mean, lot, we, please, DM, name, zi...","[business, means, lot, us, Please, DM, name, z...","[busi, mean, lot, us, pleas, dm, name, zip, co..."
2,@76328 I really hope you all change but I'm su...,really hope change Im sure wont dont,"[really, hope, change, I, m, sure, wo, nt, do,...","[really, hope, change, I, m, sure, wo, nt, do,...","[really, hope, change, I, m, sure, wo, nt, do,...","[realli, hope, chang, im, sure, wont, dont]"
3,@105836 LiveChat is online at the moment - htt...,LiveChat online moment httpstcoSYVtUKq contact...,"[LiveChat, online, moment, httpstcoSYVtUKq, co...","[livechat, online, moment, httpstcosyvtukq, co...","[LiveChat, online, moment, httpstcoSYVtUKq, co...","[livechat, onlin, moment, httpstcosyvtukq, con..."
4,@VirginTrains see attached error message. I've...,VirginTrains see attached error message Ive tr...,"[VirginTrains, see, attached, error, message, ...","[virgintrain, see, attach, error, message, I, ...","[VirginTrains, see, attached, error, message, ...","[virgintrain, see, attach, error, messag, ive,..."


## <font color=red>Exemple de deux enregistrements

In [7]:
def show(i):
    print("Text:\n")
    print(df.text[i],'\n')
    print("Cleaned Text (No punctuation nor emojis): \n")
    print(df.cleaned_text[i],'\n')
    print("Tokenization:\n")
    print(df.tokenized_text[i],'\n')
    print("Lemmatization:\n")
    print(df.lemmatized_text[i],'\n')
    print("Stemming:\n")
    print(df.stemmed_text[i],'\n')
    print("Stemming with nltk:\n")
    print(df.nltk_stemmed_text[i],'\n')
    

In [8]:
show(0)

Text:

@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡 

Cleaned Text (No punctuation nor emojis): 

AppleSupport causing reply disregarded tapped notification keyboard opened 

Tokenization:

['AppleSupport', 'causing', 'reply', 'disregarded', 'tapped', 'notification', 'keyboard', 'opened'] 

Lemmatization:

['AppleSupport', 'cause', 'reply', 'disregard', 'tap', 'notification', 'keyboard', 'open'] 

Stemming:

['AppleSupport', 'causing', 'reply', 'disregarded', 'tapped', 'notification', 'keyboard', 'opened'] 

Stemming with nltk:

['applesupport', 'caus', 'repli', 'disregard', 'tap', 'notif', 'keyboard', 'open'] 



In [9]:
show(1)

Text:

@105835 Your business means a lot to us. Please DM your name, zip code and additional details about your concern. ^RR https://t.co/znUu1VJn9r 

Cleaned Text (No punctuation nor emojis): 

business means lot us Please DM name zip code additional details concern RR httpstcoznUuVJnr 

Tokenization:

['business', 'means', 'lot', 'us', 'Please', 'DM', 'name', 'zip', 'code', 'additional', 'details', 'concern', 'RR', 'httpstcoznUuVJnr'] 

Lemmatization:

['business', 'mean', 'lot', 'we', 'please', 'DM', 'name', 'zip', 'code', 'additional', 'detail', 'concern', 'RR', 'httpstcoznUuVJnr'] 

Stemming:

['business', 'means', 'lot', 'us', 'Please', 'DM', 'name', 'zip', 'code', 'additional', 'details', 'concern', 'RR', 'httpstcoznUuVJnr'] 

Stemming with nltk:

['busi', 'mean', 'lot', 'us', 'pleas', 'dm', 'name', 'zip', 'code', 'addit', 'detail', 'concern', 'rr', 'httpstcoznuuvjnr'] 



# <center> Fin