In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from textblob import TextBlob
import string

[nltk_data] Downloading package stopwords to /Users/dbm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dbm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

### Data Exploration
---


In [3]:
#Read file to do data exploration
FileData = pd.read_csv("tweets.csv")

In [4]:
# number of dataset before clean 
len(FileData)

10000

In [5]:
#number of dataset rows and columns 
print(FileData.shape)

(10000, 7)


In [6]:
#The columns title of dataset
FileData.columns

Index(['Unnamed: 0', 'ID', 'Tweet', 'Timestamp', 'Likes', 'Retweets',
       'Length'],
      dtype='object')

In [7]:
#each column's data type:
FileData.dtypes

Unnamed: 0     int64
ID             int64
Tweet         object
Timestamp     object
Likes          int64
Retweets       int64
Length         int64
dtype: object

In [8]:
#general Information about the dataset
FileData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10000 non-null  int64 
 1   ID          10000 non-null  int64 
 2   Tweet       10000 non-null  object
 3   Timestamp   10000 non-null  object
 4   Likes       10000 non-null  int64 
 5   Retweets    10000 non-null  int64 
 6   Length      10000 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 547.0+ KB


In [9]:
print(FileData.isnull().sum())

Unnamed: 0    0
ID            0
Tweet         0
Timestamp     0
Likes         0
Retweets      0
Length        0
dtype: int64


In [10]:
final_before_cleaning = FileData.copy()

### Data Preprocessing
---


#### 1. Drop tweets that contain coupon word in them

In [11]:
# Drop tweets that contain coupon word in them

FileData = FileData[FileData["Tweet"].str.contains("هنقرستيشن_كود|كود|كود خصم|كوبون") == False].reset_index(drop=True)

display(FileData.head())
display(FileData.tail())
print("The length after dropping tweets that contain coupon word is", len(FileData))

Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
0,0,1579452840237404161,@HungerStation اتعب وانا اقول افضل برنامج عرفته للتوصيل مافى مره شكيت من شئ الا عوضوني ودائماً خدمتهم من أفضل لأفضل شكراً هنقرستيشن 😻,2022-10-10 12:44:36+00:00,0,0,165
1,1,1579452685044289536,@مسترمندوب 🌺 OWn 🌺 \nمرسول🍠🥡🍠🥡🍠\nجاهز🦞🦀🦐🦞🦀\nهنقرستيشن🦐🦞🥡\nالي عشاق االجمبرى🦞🦀🦐 https://t.co/mTpQL4eXyh,2022-10-10 12:43:59+00:00,0,0,108
2,2,1579452455254777857,@HungerStation ادخل الاقي التوصيل ب9 ريال! مالقيت بدون سعر توصيل,2022-10-10 12:43:04+00:00,0,0,64
3,3,1579452331099566081,كل عام وانتم بخير\nمسترمندوب 🌷🌛OWn 🌷\nمرسول🍿💖🍿\nجاهز🦋🌼🦋\nهنقرستيشن🌾🌼🌷 https://t.co/Qv5FsJBG30,2022-10-10 12:42:35+00:00,0,0,89
4,4,1579450892356812801,@HungerStation مافيه رقم طلب فيه فلوس اخذتوها من حسابي,2022-10-10 12:36:52+00:00,0,0,54


Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
8522,9995,1577676665664192515,@Dr_ksa2020 @HungerStation @MCgovSA اخيس تطبيق توصيل هنقرستيشن اسحبوا عليهم جاهز ممتاز 👍🏼 ولا حطوا الدفع عند الاستلام طحت ب مناديب من تطبيقهم مالهم امان,2022-10-05 15:06:43+00:00,0,0,152
8523,9996,1577676664233934850,@HungerStation @_waheed20,2022-10-05 15:06:43+00:00,0,0,25
8524,9997,1577676648870301697,@HungerStation .\n.\n@ro0o2005 😁😁,2022-10-05 15:06:39+00:00,2,1,32
8525,9998,1577676622433509378,@HungerStation #منشن_اللي \n\n❤️❤️❤️❤️❤️❤️,2022-10-05 15:06:33+00:00,0,0,43
8526,9999,1577676595506159619,@HungerStation تم يارب من نصيبي,2022-10-05 15:06:26+00:00,0,0,31


The length after dropping tweets that contain coupon word is 8527


In [12]:
before_cleaning = FileData.copy()

#### 2. Remove unneeded hashtags, new lines, tabs, mentions, links, emojis and english words

In [13]:
def processText(tweet):
    # replace @username and #hashtag with empty string
    tweet = re.sub('@[^\s]+', ' ', tweet)
    tweet = re.sub('#[^\s]+', ' ', tweet)

    # remove everything except alphanumeric
    tweet = re.sub('[/\W+/g]', ' ',  tweet)
    
    # convert www.* or https?://* to " "
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)
    
    # remove english words
    tweet = re.sub(r'\s*[A-Za-z]+\b', ' ', tweet).rstrip()
    tweet = "".join([char for char in tweet if char not in string.ascii_letters]).strip()

    # remove new lines with white space
    tweet = tweet.replace('\n', ' ')
    tweet = tweet.replace('\t', ' ')

    return tweet

In [14]:
# Run and display processText method
FileData["Tweet"] = FileData["Tweet"].apply(lambda tweet: processText(tweet))

#before
display(before_cleaning.loc[[0]])
#After
display(FileData.loc[[0]])

Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
0,0,1579452840237404161,@HungerStation اتعب وانا اقول افضل برنامج عرفته للتوصيل مافى مره شكيت من شئ الا عوضوني ودائماً خدمتهم من أفضل لأفضل شكراً هنقرستيشن 😻,2022-10-10 12:44:36+00:00,0,0,165


Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
0,0,1579452840237404161,اتعب وانا اقول افضل برنامج عرفته للتوصيل مافى مره شكيت من شئ الا عوضوني ودائما خدمتهم من أفضل لأفضل شكرا هنقرستيشن,2022-10-10 12:44:36+00:00,0,0,165


#### 3. Remove stop words

In [15]:
ar_stops = set(stopwords.words('arabic'))
stop_words = {"،","","ورحمه","وبركاته","عليكم","السلام","آض","آمينَ","آه","آهاً","آي","أ","أب","أجل","أجمع","أخ","أخذ","أصبح","أضحى","أقبل",
"أقل","أكثر","ألا","أم","أما","أمامك","أمامكَ","أمسى","أمّا","أن","أنا","انا","أنت","أنتم","أنتما","أنتن","أنتِ","أنشأ","أنّى","أو","أوشك","أولئك",
"أولئكم","أولاء","أولالك","أوّهْ","أي","أيا","أين","أينما","أيّ","أَنَّ","أََيُّ","أُفٍّ","إذ","إذا","إذاً","إذما","إذن","إلى","إليكم","إليكما","إليكنّ",
"إليكَ","إلَيْكَ","إلّا","إمّا","إن","إنّما","إي","إياك","إياكم","إياكما","إياكن","إيانا","إياه","إياها","إياهم","إياهما","إياهن","إياي","إيهٍ","إِنَّ","ا","ابتدأ",
"اثر","اجل","احد","اخرى","اخلولق","اذا","اربعة","ارتدّ","استحال","اطار","اعادة","اعلنت","اف","اكثر","اكد","الألاء","الألى","الا","الاخيرة","الان",
"الاول","الاولى","التى","التي","الثاني","الثانية","الذاتي","الذى","الذي","الذين","السابق","الف","اللائي","اللاتي","اللتان","اللتيا","اللتين",
"اللذان","اللذين","اللواتي","الماضي","المقبل","الوقت","الى","اليوم","اما","امام","امس","ان","انبرى","انقلب","انه","انها","او","اول","اي",
"ايار","ايام","ايضا","ب","بات","باسم","بان","بخٍ","برس","بسبب","بسّ","بشكل","بضع","بطآن","بعد","بعض","بك","بكم","بكما","بكن","بل","بلى","بما",
"بماذا","بمن","بن","بنا","به","بها","بي","بيد","بين","بَسْ","بَلْهَ","بِئْسَ","تانِ","تانِك","تبدّل","تجاه","تحوّل","تلقاء","تلك","تلكم","تلكما","تم",
"تينك","تَيْنِ","تِه","تِي","ثلاثة","ثم","ثمّ","ثمّة","ثُمَّ","جعل","جلل","جميع","جير","حار","حاشا","حاليا","حاي","حتى","حرى","حسب","حم","حوالى","حول",
"حيث","حيثما","حين","حيَّ","حَبَّذَا","حَتَّى","حَذارِ","خلا","خلال","دون","دونك","ذا","ذات","ذاك","ذانك","ذانِ","ذلك","ذلكم","ذلكما","ذلكن","ذو","ذوا",
"ذواتا","ذواتي","ذيت","ذينك","ذَيْنِ","ذِه","ذِي","راح","رجع","رويدك","ريث","رُبَّ","زيارة","سبحان","سرعان","سنة","سنوات","سوف","سوى","سَاءَ","سَاءَمَا",
"شبه","شخصا","شرع","شَتَّانَ","صار","صباح","صفر","صهٍ","صهْ","ضمن","طاق","طالما","طفق","طَق","ظلّ","عاد","عام","عاما","عامة","عدا","عدة","عدد","عدم",
"عسى","عشر","عشرة","علق","على","عليك","عليه","عليها","علًّ","عن","عند","عندما","عوض","عين","عَدَسْ","عَمَّا","غدا","غير","ـ","ف","فان","فلان","فو",
"فى","في","فيم","فيما","فيه","فيها","قال","قام","قبل","قد","قطّ","قلما","قوة","كأنّما","كأين","كأيّ","كأيّن","كاد","كان","كانت","كذا","كذلك","كرب",
"كل","كلا","كلاهما","كلتا","كلم","كليكما","كليهما","كلّما","كلَّا","كم","كما","كي","كيت","كيف","كيفما","كَأَنَّ","كِخ","لئن","لا","لات","لاسيما","لدن","لدى",
"لعمر","لقاء","لك","لكم","لكما","لكن","لكنَّما","لكي","لكيلا","للامم","لم","لما","لمّا","لن","لنا","له","لها","لو","لوكالة","لولا","لوما","لي","لَسْتَ",
"لَسْتُ","لَسْتُم","لَسْتُمَا","لَسْتُنَّ","لَسْتِ","لَسْنَ","لَعَلَّ","لَكِنَّ","لَيْتَ","لَيْسَ","لَيْسَا","لَيْسَتَا","لَيْسَتْ","لَيْسُوا","لَِسْنَا","ما","ماانفك","مابرح","مادام","ماذا",
"مازال","مافتئ","مايو","متى","مثل","مذ","مساء","معاذ","مقابل","مكانكم","مكانكما","مكانكنّ","مكانَك","مليار","مليون","مما","ممن","من","منذ",
"منها","مه","مهما","مَنْ","مِن","نحن","نحو","نعم","نفس","نفسه","نهاية","نَخْ","نِعِمّا","نِعْمَ","ها","هاؤم","هاكَ","هاهنا","هبّ","هذا","هذه","هكذا",
"هل","هلمَّ","هلّا","هم","هما","هن","هنا","هناك","هنالك","هو","هي","هيا","هيت","هيّا","هَؤلاء","هَاتانِ","هَاتَيْنِ","هَاتِه","هَاتِي","هَجْ","هَذا","هَذانِ","هَذَيْنِ",
"هَذِه","هَذِي","هَيْهَاتَ","و","وا","واحد","واضاف","واضافت","واكد","وان","واهاً","واوضح","وراءَك","وفي","وقال","وقالت","وقد","وقف","وكان","وكانت",
"ولا","ولم","ومن","مَن","وهو","وهي","ويكأنّ","وَيْ","وُشْكَانََ","يكون","يمكن","يوم","ّأيّان"}

def remove_stop_words(tweet):
    zen = TextBlob(tweet)
    words = zen.words
    return " ".join([w for w in words if not w in ar_stops and not w in stop_words and len(w) >= 2])

In [16]:
# Run and display remove_stop_words method
FileData["Tweet"] = FileData["Tweet"].apply(lambda tweet: remove_stop_words(tweet))

#before
display(before_cleaning.loc[[19]])
#After
display(FileData.loc[[19]])

Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
19,19,1579447928149934083,انا عندي 10 طلبات توصيل مجانا بس اذا جيت اطلب يحط لي سعر التوصيل كيف كذا !,2022-10-10 12:25:05+00:00,0,0,89


Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
19,19,1579447928149934083,عندي 10 طلبات توصيل مجانا جيت اطلب يحط سعر التوصيل,2022-10-10 12:25:05+00:00,0,0,89


#### 4. Lastly Remove Duplicates Records

In [17]:
# Assess the number of duplicates in dataset
FileData = FileData[FileData['Tweet'].astype(bool)].reset_index(drop=True)                                                                                                                
duplicateTweets = FileData[FileData.duplicated('Tweet')]

display(duplicateTweets.head())
print('The number of duplicates is ', len(duplicateTweets))

Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
7,7,1579449391806820353,وانتم بخير مسترمندوب مرسول جاهز هنقرستيشن,2022-10-10 12:30:54+00:00,2,1,64
8,8,1579449246310596609,وانتم بخير مسترمندوب مرسول جاهز هنقرستيشن,2022-10-10 12:30:19+00:00,3,2,65
9,9,1579449245593063425,وانتم بخير مسترمندوب مرسول جاهز هنقرستيشن,2022-10-10 12:30:19+00:00,2,1,105
10,10,1579449014130704384,مرسول جاهز هنقرستيشن الي عشاق االجمبرى,2022-10-10 12:29:24+00:00,3,2,84
12,12,1579448885210071041,وانتم بخير مسترمندوب مرسول جاهز هنقرستيشن,2022-10-10 12:28:53+00:00,3,2,105


The number of duplicates is  4369


In [18]:
# To ensure that every duplicate will be removed, we removed the duplicates After applyning all the cleaning

# Remove duplicates from dataset

Final_tweets = FileData.drop_duplicates('Tweet').reset_index(drop=True).drop(['Unnamed: 0'], axis=1)

display(Final_tweets.head())
display(Final_tweets.tail())
print("Length after removing duplicates", len(Final_tweets))

Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
0,1579452840237404161,اتعب وانا اقول افضل برنامج عرفته للتوصيل مافى مره شكيت شئ عوضوني ودائما خدمتهم أفضل لأفضل شكرا هنقرستيشن,2022-10-10 12:44:36+00:00,0,0,165
1,1579452685044289536,مرسول جاهز هنقرستيشن الي عشاق االجمبرى,2022-10-10 12:43:59+00:00,0,0,108
2,1579452455254777857,ادخل الاقي التوصيل ب9 مالقيت بدون سعر توصيل,2022-10-10 12:43:04+00:00,0,0,64
3,1579452331099566081,وانتم بخير مسترمندوب مرسول جاهز هنقرستيشن 530,2022-10-10 12:42:35+00:00,0,0,89
4,1579450892356812801,مافيه رقم طلب فلوس اخذتوها حسابي,2022-10-10 12:36:52+00:00,0,0,54


Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
3765,1577676950411399168,اليينه انت,2022-10-05 15:07:51+00:00,0,0,36
3766,1577676733758799877,هنقرستيشن اسوء شي,2022-10-05 15:06:59+00:00,0,0,17
3767,1577676699847761921,بالعكس والله التطبيق الممتاز وخدمني كثير المطاعم تجيب الاكل بارد تغث هوا يخدم ويفك ازمه حبيتو مرررره شكرا هنقرستيشن,2022-10-05 15:06:51+00:00,0,0,155
3768,1577676665664192515,اخيس تطبيق توصيل هنقرستيشن اسحبوا عليهم جاهز ممتاز حطوا الدفع الاستلام طحت مناديب تطبيقهم مالهم امان,2022-10-05 15:06:43+00:00,0,0,152
3769,1577676595506159619,يارب نصيبي,2022-10-05 15:06:26+00:00,0,0,31


Length after removing duplicates 3770


#### Display the dataset before and after cleaning
---

In [19]:
display(final_before_cleaning.head())
display(final_before_cleaning.tail())

display(Final_tweets.head())
display(Final_tweets.tail())

Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
0,0,1579452840237404161,@HungerStation اتعب وانا اقول افضل برنامج عرفته للتوصيل مافى مره شكيت من شئ الا عوضوني ودائماً خدمتهم من أفضل لأفضل شكراً هنقرستيشن 😻,2022-10-10 12:44:36+00:00,0,0,165
1,1,1579452685044289536,@مسترمندوب 🌺 OWn 🌺 \nمرسول🍠🥡🍠🥡🍠\nجاهز🦞🦀🦐🦞🦀\nهنقرستيشن🦐🦞🥡\nالي عشاق االجمبرى🦞🦀🦐 https://t.co/mTpQL4eXyh,2022-10-10 12:43:59+00:00,0,0,108
2,2,1579452455254777857,@HungerStation ادخل الاقي التوصيل ب9 ريال! مالقيت بدون سعر توصيل,2022-10-10 12:43:04+00:00,0,0,64
3,3,1579452331099566081,كل عام وانتم بخير\nمسترمندوب 🌷🌛OWn 🌷\nمرسول🍿💖🍿\nجاهز🦋🌼🦋\nهنقرستيشن🌾🌼🌷 https://t.co/Qv5FsJBG30,2022-10-10 12:42:35+00:00,0,0,89
4,4,1579450892356812801,@HungerStation مافيه رقم طلب فيه فلوس اخذتوها من حسابي,2022-10-10 12:36:52+00:00,0,0,54


Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
9995,9995,1577676665664192515,@Dr_ksa2020 @HungerStation @MCgovSA اخيس تطبيق توصيل هنقرستيشن اسحبوا عليهم جاهز ممتاز 👍🏼 ولا حطوا الدفع عند الاستلام طحت ب مناديب من تطبيقهم مالهم امان,2022-10-05 15:06:43+00:00,0,0,152
9996,9996,1577676664233934850,@HungerStation @_waheed20,2022-10-05 15:06:43+00:00,0,0,25
9997,9997,1577676648870301697,@HungerStation .\n.\n@ro0o2005 😁😁,2022-10-05 15:06:39+00:00,2,1,32
9998,9998,1577676622433509378,@HungerStation #منشن_اللي \n\n❤️❤️❤️❤️❤️❤️,2022-10-05 15:06:33+00:00,0,0,43
9999,9999,1577676595506159619,@HungerStation تم يارب من نصيبي,2022-10-05 15:06:26+00:00,0,0,31


Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
0,1579452840237404161,اتعب وانا اقول افضل برنامج عرفته للتوصيل مافى مره شكيت شئ عوضوني ودائما خدمتهم أفضل لأفضل شكرا هنقرستيشن,2022-10-10 12:44:36+00:00,0,0,165
1,1579452685044289536,مرسول جاهز هنقرستيشن الي عشاق االجمبرى,2022-10-10 12:43:59+00:00,0,0,108
2,1579452455254777857,ادخل الاقي التوصيل ب9 مالقيت بدون سعر توصيل,2022-10-10 12:43:04+00:00,0,0,64
3,1579452331099566081,وانتم بخير مسترمندوب مرسول جاهز هنقرستيشن 530,2022-10-10 12:42:35+00:00,0,0,89
4,1579450892356812801,مافيه رقم طلب فلوس اخذتوها حسابي,2022-10-10 12:36:52+00:00,0,0,54


Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
3765,1577676950411399168,اليينه انت,2022-10-05 15:07:51+00:00,0,0,36
3766,1577676733758799877,هنقرستيشن اسوء شي,2022-10-05 15:06:59+00:00,0,0,17
3767,1577676699847761921,بالعكس والله التطبيق الممتاز وخدمني كثير المطاعم تجيب الاكل بارد تغث هوا يخدم ويفك ازمه حبيتو مرررره شكرا هنقرستيشن,2022-10-05 15:06:51+00:00,0,0,155
3768,1577676665664192515,اخيس تطبيق توصيل هنقرستيشن اسحبوا عليهم جاهز ممتاز حطوا الدفع الاستلام طحت مناديب تطبيقهم مالهم امان,2022-10-05 15:06:43+00:00,0,0,152
3769,1577676595506159619,يارب نصيبي,2022-10-05 15:06:26+00:00,0,0,31


### Save the Cleaned Data
---

In [20]:
Final_tweets.to_csv('final_tweets_cleaned.csv')