In [1]:
import pandas as pd
import re

### Data Exploration
---


In [2]:
# Load the raw tweets CSV into a DataFrame for exploration
FileData = pd.read_csv("tweets.csv")

In [3]:
# Row count of the raw dataset, before any cleaning
FileData.shape[0]

10000

In [4]:
# Dataset dimensions as (rows, columns)
print((len(FileData.index), len(FileData.columns)))

(10000, 7)


In [5]:
# Column labels of the dataset
FileData.columns

Index(['Unnamed: 0', 'ID', 'Tweet', 'Timestamp', 'Likes', 'Retweets',
       'Length'],
      dtype='object')

In [6]:
# Data type of each column
FileData.dtypes

Unnamed: 0     int64
ID             int64
Tweet         object
Timestamp     object
Likes          int64
Retweets       int64
Length         int64
dtype: object

In [7]:
# Summary of the dataset: index range, per-column non-null counts and dtypes, memory usage
FileData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10000 non-null  int64 
 1   ID          10000 non-null  int64 
 2   Tweet       10000 non-null  object
 3   Timestamp   10000 non-null  object
 4   Likes       10000 non-null  int64 
 5   Retweets    10000 non-null  int64 
 6   Length      10000 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 547.0+ KB


In [8]:
# Count of missing values in each column
print(FileData.isna().sum())

Unnamed: 0    0
ID            0
Tweet         0
Timestamp     0
Likes         0
Retweets      0
Length        0
dtype: int64


### Data Preprocessing
---


In [9]:
# Remove links, mentions, hashtags and every non-word character (emojis, punctuation)
def remove_emojis_links_unrelated_chars(tweet):
    """Strip links, mentions, hashtags and non-word characters from a tweet.

    Every removed span or character is replaced by a single space, so words
    on either side never get glued together. Whitespace is neither collapsed
    nor trimmed.

    Args:
        tweet: The tweet text (str).

    Returns:
        The cleaned text (str). Word characters as defined by the regex
        ``\\w`` class (letters of any script, digits and underscore) are kept;
        everything else becomes a space.
    """
    # Raw strings keep regex escapes (\., \s, \W) out of Python's own
    # string-escape processing.

    # Replace hyperlinks (www.* or http(s)://*) with a space
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)

    # Replace mentions and hashtags (up to the next whitespace) with a space
    tweet = re.sub(r'@[^\s]+', ' ', tweet)
    tweet = re.sub(r'#[^\s]+', ' ', tweet)

    # Replace each non-word character with a space.
    # The earlier pattern '[/\W+/g]' was a JavaScript-style /.../g literal;
    # inside a Python character class it also matched the letter "g" and
    # silently deleted it from every tweet.
    # \W already covers newlines, tabs and carriage returns, so no separate
    # replacement of those characters is needed afterwards.
    tweet = re.sub(r'\W', ' ', tweet)

    return tweet

In [10]:
# Single entry point that runs every cleaning step on one tweet
def clean_up(tweet):
    """Return the tweet after applying all cleaning functions in order.

    Currently the only step is remove_emojis_links_unrelated_chars; further
    steps can be chained here, each taking and returning the tweet text.
    """
    cleaned = remove_emojis_links_unrelated_chars(tweet)
    # Add further cleaning steps here, e.g.:
    # cleaned = function(cleaned)
    return cleaned

In [11]:
# Run the cleaning pipeline on the text of every record in the dataset
FileData["Tweet"] = FileData["Tweet"].apply(clean_up)

 Lastly, remove duplicate records

In [12]:
# Duplicates are removed only after all the cleaning has been applied,
# so that every duplicate of the cleaned text is caught.

# Rows whose 'Tweet' text repeats an earlier row
duplicateTweets = FileData.loc[FileData.duplicated(subset='Tweet')]

display(duplicateTweets.head())
print('The number of duplicates is ', duplicateTweets.shape[0])

Unnamed: 0.1,Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
13,13,1579448805833199616,كل عام وانتم بخير مسترمندوب Mo11 مرسول ...,2022-10-10 12:28:34+00:00,4,6,162
15,15,1579448604929839105,كل عام وانتم بخير مسترمندوب Ha7 مرسول Ha7 ...,2022-10-10 12:27:46+00:00,3,2,105
24,24,1579447495436173312,w66 مرسول جاهز هنقرستيشن ...,2022-10-10 12:23:22+00:00,0,0,135
25,25,1579447492789563392,لا تفوت كود تويو مثل هنقرستيشن يرجع 15 50 اول ...,2022-10-10 12:23:21+00:00,0,0,86
26,26,1579447262455169024,لا تفوت كود تويو مثل هنقرستيشن يرجع 15 50 اول ...,2022-10-10 12:22:26+00:00,0,0,86


The number of duplicates is  5204


In [13]:
# Drop rows with a repeated 'Tweet' text, renumber the remaining rows,
# and discard the 'Unnamed: 0' column

Final_tweets = (
    FileData
    .drop_duplicates(subset='Tweet')
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)

display(Final_tweets.head())
display(Final_tweets.tail())
print("Length after removing duplicates", Final_tweets.shape[0])


Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
0,1579452840237404161,وش رايك فيني و انا أضبطك مع هنقرستيشن بيجي...,2022-10-10 12:44:36+00:00,0,0,165
1,1579452685044289536,OWn مرسول جاهز هنقر...,2022-10-10 12:43:59+00:00,0,0,108
2,1579452455254777857,ادخل الاقي التوصيل ب9 ريال مالقيت بدون سعر ...,2022-10-10 12:43:04+00:00,0,0,64
3,1579452331099566081,كل عام وانتم بخير مسترمندوب OWn مرسول ج...,2022-10-10 12:42:35+00:00,0,0,89
4,1579450892356812801,مافيه رقم طلب فيه فلوس اخذتوها من حسابي,2022-10-10 12:36:52+00:00,0,0,54


Unnamed: 0,ID,Tweet,Timestamp,Likes,Retweets,Length
4791,1577676909361725440,كل لعبه تسحيب,2022-10-05 15:07:41+00:00,0,0,45
4792,1577676733758799877,هنقرستيشن اسوء شي,2022-10-05 15:06:59+00:00,0,0,17
4793,1577676699847761921,بالعكس والله التطبيق فوق الممتاز وخدمني كث...,2022-10-05 15:06:51+00:00,0,0,155
4794,1577676665664192515,اخيس تطبيق توصيل هنقرستيشن اسحبوا عليهم ...,2022-10-05 15:06:43+00:00,0,0,152
4795,1577676595506159619,تم يارب من نصيبي,2022-10-05 15:06:26+00:00,0,0,31


Length after removing duplicates 4796


In [14]:
# Write the de-duplicated tweets to a .csv file.
# NOTE(review): the row index is written as an unnamed first column, which
# pandas presumably labels 'Unnamed: 0' when the file is read back; pass
# index=False instead if nothing downstream relies on that column.
Final_tweets.to_csv('nnn.csv', index=True)