# Loading Data Functions

In [None]:
!pip install pandas nltk spacy langdetect tashaphyne pymongo demoji

In [None]:
!python -m spacy download fr_core_news_sm

In [None]:
from pymongo import MongoClient
import pandas as pd
import nltk
import spacy
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.isri import ISRIStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
import demoji
import string
from tashaphyne.stemming import ArabicLightStemmer

In [None]:
# Download the NLTK data packages requested by this notebook, one after the other.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)
# Download the emoji code table for demoji.
demoji.download_codes()

In [None]:
# English: WordNet lemmatizer and Porter stemmer.
lemmatizer_en = WordNetLemmatizer()
stemmer_en = PorterStemmer()
# French: Snowball stemmer.
stemmer_fr = SnowballStemmer('french')
# Arabic: Tashaphyne light stemmer and NLTK ISRI stemmer.
ar_stemmer = ArabicLightStemmer()
stemmer_ar = ISRIStemmer()
# spaCy pipelines for English and French.
nlp_en = spacy.load('en_core_web_sm')
nlp_fr = spacy.load('fr_core_news_sm')
# Tagger model download (nltk.pos_tag is called in pos_tagging).
nltk.download('averaged_perceptron_tagger')

## Loading Data

In [None]:
def load_data():
    """Load the tweets collection from MongoDB into a DataFrame.

    Only the ``full_text``, ``lang`` and ``topic`` fields are fetched
    (``_id`` is excluded); ``full_text`` is renamed to ``tweet``.

    The credentials can be overridden with the ``MONGODB_USERNAME`` and
    ``MONGODB_PASSWORD`` environment variables. When they are not set, the
    values that were previously hard-coded here are used.

    Returns:
        pandas.DataFrame: one row per document returned by the query.
    """
    import os
    from urllib.parse import quote_plus

    # NOTE(review): the fallback credentials are still committed in source;
    # they should be rotated and removed once the environment variables are
    # configured wherever this notebook runs.
    username = os.environ.get("MONGODB_USERNAME", "mlteam")
    password = os.environ.get("MONGODB_PASSWORD", "mlteam")
    database_name = "TweetsDataBase"
    collection_name = "TweetsData"
    # Percent-escape the credentials so reserved characters (':', '@', '/')
    # cannot corrupt the connection string.
    uri = (
        f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}"
        "@cluster0.6y3bpz0.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
    )
    projection = {'full_text': 1, 'lang': 1, 'topic': 1, '_id': 0}

    # The context manager closes the client (and its connections) even if the
    # query raises.
    with MongoClient(uri) as client:
        collection = client[database_name][collection_name]
        documents = list(collection.find({}, projection))

    df = pd.DataFrame(documents)
    return df.rename(columns={'full_text': 'tweet'})

# Preprocessing Functions

## Sentence Segmentation

In [None]:
!pip install farasapy

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [None]:
#from farasa.segmenter import FarasaSegmenter

In [None]:
# Initialize Farasa Segmenter (only if you are using it)
#farasa_segmenter = FarasaSegmenter(interactive=True)

In [None]:
def sentence_segmentation(text, lang):
    """Split ``text`` into a list of sentences.

    Args:
        text: The raw text to split.
        lang: Language code. ``'fr'`` uses NLTK's French sentence model,
            ``'ar'`` uses a punctuation-based splitter, and every other
            value (including ``'en'``) uses NLTK's default English model.

    Returns:
        list[str]: the sentences, in order. Empty Arabic input gives ``[]``.
    """
    if lang == 'ar':
        # The Farasa segmenter this branch used to call is never created in
        # this file (its import and construction are commented out), so the
        # call raised NameError. Split after sentence-final punctuation
        # instead, including the Arabic question mark and full stop.
        pieces = re.split(r'(?<=[.!?؟۔])\s+', text.strip())
        return [piece for piece in pieces if piece]
    if lang == 'fr':
        # Use the French model rather than the English default.
        return nltk.sent_tokenize(text, language='french')
    # English, and fallback for any other language.
    return nltk.sent_tokenize(text)

## Dependency Parsing

In [None]:
def dependency_parsing(tokens, lang):
    """Return ``(text, dependency label, head text)`` triples for ``tokens``.

    The tokens are joined with single spaces and parsed with the spaCy
    pipeline for ``lang`` (``'en'`` or ``'fr'``). Any other language,
    including ``'ar'``, yields an empty list.
    """
    if lang == 'en':
        parser = nlp_en
    elif lang == 'fr':
        parser = nlp_fr
    else:
        # Dependency parsing for Arabic not supported by spaCy; other
        # languages are not handled either.
        return []

    parsed = parser(' '.join(tokens))
    return [(word.text, word.dep_, word.head.text) for word in parsed]

## Part-of-Speech Tagging

In [None]:
def pos_tagging(tokens, lang):
    """Return ``(token, tag)`` pairs for ``tokens``.

    English uses ``nltk.pos_tag``; French uses the spaCy French pipeline on
    the space-joined tokens; Arabic tokens all get the placeholder tag
    ``'N/A'``. Any other language yields an empty list.
    """
    if lang == 'ar':
        # No Arabic tagger is wired in: every token gets a placeholder tag.
        return [(word, 'N/A') for word in tokens]
    if lang == 'fr':
        parsed = nlp_fr(' '.join(tokens))
        return [(word.text, word.pos_) for word in parsed]
    if lang == 'en':
        return nltk.pos_tag(tokens)
    return []

## Data Quality

In [None]:
# Lower-case substrings a tweet must contain (at least one) to be kept for a
# given topic. Topics that are not listed here are never filtered.
_TOPIC_KEYWORDS = {
    'Economy': ('eco', 'invest', 'قتص', 'مال', 'تجار'),
    'Politics': ('polit', 'سياس', 'حكو'),
    'Tourism': ('touri', 'سياح'),
    'Technology': ('techno', 'تكنو'),
}


def data_quality(data_frame, topic_keywords=None):
    """Drop tweets whose text does not match the keywords of their topic.

    Tweets are lower-cased first. A row labelled with a topic present in
    ``topic_keywords`` is kept only if its tweet contains at least one of
    that topic's keywords; rows with any other topic are always kept.

    Args:
        data_frame: DataFrame with ``tweet`` and ``topic`` columns. It is
            not modified; a filtered copy is returned.
        topic_keywords: Optional mapping ``topic -> iterable of substrings``.
            Substrings are matched literally against the lower-cased tweet,
            so they should be lower-case. Defaults to the built-in table for
            Economy, Politics, Tourism and Technology.

    Returns:
        pandas.DataFrame: the remaining rows with lower-cased ``tweet``.
    """
    if topic_keywords is None:
        topic_keywords = _TOPIC_KEYWORDS

    # Work on a copy so the caller's frame is not lower-cased behind its back.
    result = data_frame.copy()
    result['tweet'] = result['tweet'].str.lower()

    keep = pd.Series(True, index=result.index)
    for topic, keywords in topic_keywords.items():
        in_topic = result['topic'] == topic
        if keywords:
            # One alternation per topic; escaping keeps keywords literal.
            pattern = '|'.join(re.escape(keyword) for keyword in keywords)
            # na=False: a missing tweet can never match a keyword.
            relevant = result['tweet'].str.contains(pattern, regex=True, na=False)
        else:
            relevant = pd.Series(False, index=result.index)
        # Rows of other topics pass; rows of this topic must be relevant.
        keep &= ~in_topic | relevant

    return result.loc[keep]

## Cleaning Text

In [None]:
def clean_text(text):
    """Strip links, mentions, symbols, emojis, punctuation and digits from a tweet.

    Links and @-mentions are deleted; non-word characters and digits are
    replaced by a space; the result is lower-cased. Runs of spaces are left
    as they are.
    """
    cleaned = text
    # Order matters: links and mentions must go before '\W' breaks them up.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # hyperlinks
        (r'@\w+', ''),                   # tags (@xxxxx)
        (r'\W', ' '),                    # special characters
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Emojis
    cleaned = demoji.replace(cleaned, '')

    # Punctuation (deleted outright, not replaced by a space)
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    # Digits (including Arabic-Indic digits) and any leftover non-word characters
    cleaned = re.sub(r'[\W\d٠١٢٣٤٥٦٧٨٩]', ' ', cleaned)

    return cleaned.lower()

## Word Tokenization

In [None]:
def word_tokenization(text, lang):
    """Tokenize ``text`` into words with NLTK's ``word_tokenize``.

    ``lang`` is accepted for symmetry with the other preprocessing steps,
    but English, French, Arabic and every other language currently go
    through the same tokenizer call.
    """
    # No language-specific tokenizer is wired in: one call serves all.
    return word_tokenize(text)

## Stemming and Lemmatization

In [None]:
def stemming(tokens, lang):
    """Stem every token with the stemmer configured for ``lang``.

    Supported languages are ``'en'``, ``'fr'`` and ``'ar'``; for any other
    value the input ``tokens`` object is returned untouched.
    """
    if lang == 'en':
        active_stemmer = stemmer_en
    elif lang == 'fr':
        active_stemmer = stemmer_fr
    elif lang == 'ar':
        active_stemmer = stemmer_ar
    else:
        # Unknown language: pass the tokens through as they are.
        return tokens

    return [active_stemmer.stem(word) for word in tokens]

In [None]:
def lemmatization(tokens, lang):
    """Lemmatize a tweet and return the resulting list of words.

    Args:
        tokens: Either the tweet as a single string (how the pipeline calls
            this function) or an already tokenized list of words.
        lang: ``'fr'`` uses the spaCy French pipeline and keeps alphabetic
            tokens only; ``'ar'`` applies the Tashaphyne light stemmer;
            ``'en'`` and any other value use the WordNet lemmatizer.

    Returns:
        list[str]: one lemma (or light stem, for Arabic) per kept word.
    """
    is_text = isinstance(tokens, str)

    if lang == 'fr':
        # spaCy needs running text, so re-join a token list first.
        text = tokens if is_text else ' '.join(tokens)
        return [word.lemma_ for word in nlp_fr(text) if word.is_alpha]

    # A string is split into words; a list is taken as already tokenized.
    # The fallback branch used to iterate a string character by character.
    words = word_tokenize(tokens) if is_text else list(tokens)

    if lang == 'ar':
        return [ar_stemmer.light_stem(word) for word in words]

    # English, and fallback for unsupported languages.
    return [lemmatizer_en.lemmatize(word) for word in words]

## Stop Word Analysis

In [None]:
# Stop-word sets from the NLTK corpus, in the order English, French, Arabic.
stop_words_en, stop_words_fr, stop_words_ar = (
    set(stopwords.words(corpus_language))
    for corpus_language in ('english', 'french', 'arabic')
)

In [None]:
def remove_stop_words(tokens, lang):
    """Return ``tokens`` without the stop words of ``lang``.

    Args:
        tokens: Iterable of word strings.
        lang: ``'en'``, ``'fr'`` or ``'ar'``. For any other value no stop
            word list is available and the tokens are returned unchanged
            (previously the function returned ``None`` in that case).

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    if lang == 'en':
        stop_words = stop_words_en
    elif lang == 'fr':
        stop_words = stop_words_fr
    elif lang == 'ar':
        stop_words = stop_words_ar
    else:
        # Unsupported language: keep everything instead of returning None,
        # which would break the ' '.join(...) done by the pipeline.
        return list(tokens)

    # Lower-casing does not change Arabic script, so one comparison rule
    # serves all three languages.
    return [token for token in tokens if token.lower() not in stop_words]

## Remove Rare and Frequent Words

In [None]:
def remove_rare_frequent_words(df, rare_max_count=2, frequent_ratio=0.95):
    """Remove very rare and very frequent words from every tweet.

    Words are whitespace-separated. A word is removed when its total number
    of occurrences across all tweets is ``<= rare_max_count`` or
    ``>= len(df) * frequent_ratio``.

    Args:
        df: DataFrame with a string ``tweet`` column. It is not modified.
        rare_max_count: Highest occurrence count still considered rare.
        frequent_ratio: Fraction of the number of tweets at or above which
            a word's occurrence count is considered too frequent.

    Returns:
        pandas.DataFrame: a new frame with the filtered ``tweet`` column.
        Tweets may become empty strings.
    """
    # Split each tweet once and reuse the result for counting and filtering.
    tokenized = [tweet.split() for tweet in df['tweet']]
    word_counts = Counter(word for words in tokenized for word in words)

    # NOTE(review): this compares total occurrences (repeats inside one tweet
    # included) with a fraction of the tweet count; confirm whether a
    # per-tweet document frequency was intended.
    frequent_threshold = len(df) * frequent_ratio
    words_to_remove = {
        word
        for word, count in word_counts.items()
        if count <= rare_max_count or count >= frequent_threshold
    }

    filtered = [
        ' '.join(word for word in words if word not in words_to_remove)
        for words in tokenized
    ]
    # assign() returns a new frame, so nothing is written into a slice of
    # the caller's data.
    return df.assign(tweet=filtered)

## Balance Data

In [None]:
def balance_data(data):
    """Down-sample every topic to the size of the smallest topic.

    Each topic is sampled with ``random_state=42``; the per-topic samples
    are concatenated in order of first appearance of the topic.
    """
    topics = data['topic']
    smallest_class_size = topics.value_counts().min()

    per_topic_samples = []
    for label in topics.unique():
        topic_rows = data[topics == label]
        per_topic_samples.append(
            topic_rows.sample(n=smallest_class_size, random_state=42)
        )

    return pd.concat(per_topic_samples)

## Clean DataFrame

In [None]:
def clean_dataframe(data):
    """Return a cleaned copy of ``data`` with usable, unique tweets only.

    Steps: lower-case ``tweet``; keep rows whose ``lang`` is ``'fr'``,
    ``'ar'`` or ``'en'``; drop duplicate tweets (first occurrence kept);
    drop missing tweets; drop empty-string tweets.

    Args:
        data: DataFrame with ``tweet`` and ``lang`` columns. It is not
            modified.

    Returns:
        pandas.DataFrame: an independent copy holding the remaining rows.
    """
    # assign() builds a new frame instead of lower-casing the caller's column.
    cleaned = data.assign(tweet=data['tweet'].str.lower())
    cleaned = cleaned[cleaned['lang'].isin(['fr', 'ar', 'en'])]
    # Non-inplace calls: inplace=True on a filtered slice is a
    # chained-assignment hazard in pandas.
    cleaned = cleaned.drop_duplicates(subset=['tweet'])
    cleaned = cleaned.dropna(subset=['tweet'])
    # Explicit copy so callers can assign columns on the result safely.
    return cleaned[cleaned['tweet'] != ''].copy()

# Preprocessing Pipeline

## Pipeline Function

In [None]:
def _apply_by_language(data, func):
    """Call ``func(tweet, lang)`` for every row; return a Series aligned with ``data``."""
    results = [func(tweet, lang) for tweet, lang in zip(data['tweet'], data['lang'])]
    return pd.Series(results, index=data.index, dtype=object)


def data_processing_pipeline(data):
    """Run the full tweet preprocessing pipeline.

    Steps, in order: topic keyword filtering (labelled data only), frame
    cleaning, text cleaning, tokenization, stop-word removal, lemmatization,
    rare/frequent word removal and class balancing (labelled data only).
    The frame is re-cleaned after each step that can empty or duplicate
    tweets.

    Args:
        data: DataFrame with ``tweet`` and ``lang`` columns and, optionally,
            a ``topic`` column.

    Returns:
        pandas.DataFrame with columns ``tweet``, ``lang`` and ``topic`` when
        the input has a ``topic`` column; otherwise the ``tweet`` Series.
    """
    # Decide once, up front: the topic-dependent steps used to run
    # unconditionally, which made the "no topic" return branch unreachable.
    has_topic = 'topic' in data.columns

    # Data quality (needs the topic labels)
    if has_topic:
        data = data_quality(data)
    # Clean DataFrame; copy so the column assignments below never write
    # into a slice of the caller's frame.
    data = clean_dataframe(data).copy()
    # Clean tweets
    data['tweet'] = data['tweet'].apply(clean_text)
    # Tokenization, then stop-word removal on the token lists
    data['tweet'] = _apply_by_language(data, word_tokenization)
    data['tweet'] = _apply_by_language(data, remove_stop_words)
    # Join tokens back into a single string
    data['tweet'] = data['tweet'].apply(' '.join)
    # Clean DataFrame (drops tweets that became empty or duplicated)
    data = clean_dataframe(data).copy()
    # Lemmatization, then rebuild the tweet from the lemmas
    data['tweet'] = _apply_by_language(data, lemmatization)
    data['tweet'] = data['tweet'].apply(' '.join)
    # Remove rare and frequent words
    data = remove_rare_frequent_words(data)
    # Clean DataFrame
    data = clean_dataframe(data)

    if not has_topic:
        # Unlabelled input: nothing to balance, return the tweets only.
        return data['tweet']

    # Balance data across topics
    data = balance_data(data)
    return data[['tweet', 'lang', 'topic']]

## Execute the Pipeline

In [None]:
# Fetch the raw tweets from MongoDB into a DataFrame.
data_frame = load_data()

In [None]:
# Run the preprocessing pipeline on the loaded tweets.
processed_data = data_processing_pipeline(data_frame)

In [None]:
# Write the processed tweets to tweetsData.csv without the index column.
processed_data.to_csv('tweetsData.csv',index=False)