# Twitter Data Scraper

In [8]:
import pandas as pd
import json
import re

# Load the dataset
with open("dataset_easy-twitter-search-scraper_2024-10-10_01-56-24-056.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [9]:
# Convert the JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the dataset to understand its structure
print("Original Dataset:")
print(df.head())

Original Dataset:
                    id                                                url  \
0  1844183629653737485  https://twitter.com/IStopSpeed/status/18441836...   
1  1844171042832646341  https://twitter.com/kindalornso816/status/1844...   
2  1844034407159701602  https://twitter.com/WINTER_THAILAND/status/184...   
3  1844128223300551170  https://twitter.com/ugetnofunds/status/1844128...   
4  1844122745422159957  https://twitter.com/aliawan6421/status/1844122...   

   verified                 timestamp  \
0     False  2024-10-10T01:10:00.000Z   
1     False  2024-10-10T00:20:00.000Z   
2      True  2024-10-09T15:17:00.000Z   
3     False  2024-10-09T21:30:00.000Z   
4     False  2024-10-09T21:08:00.000Z   

                                                text  \
0  selagi gak masuk indonesia mah bodo amat.\nber...   
1  Gak sabar nunggu update selanjutnya tentang ak...   
2  การ์ดน้องหนาว จากอัลบั้ม aespa 에스파 The 5th Min...   
3                                    speed indon

In [11]:
# Here is columns of interest like 'username', 'content', 'date', etc.
df = df[['id', 'timestamp', 'text', 'searchQuery']]

In [12]:
# Convert 'date' column to datetime format for better time-based analysis
df['timestamp'] = pd.to_datetime(df['timestamp'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = pd.to_datetime(df['timestamp'])


In [16]:
# Clean the 'text' column: remove URLs, hashtags, mentions, and unnecessary characters from the tweets
def clean_text(text):
    if not isinstance(text, str):
        text = ''
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove mentions
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['clean_content'] = df['text'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print("Cleaned Dataset:")
print(df.head())

Cleaned Dataset:
                    id                 timestamp  \
0  1844183629653737485 2024-10-10 01:10:00+00:00   
1  1844171042832646341 2024-10-10 00:20:00+00:00   
2  1844034407159701602 2024-10-09 15:17:00+00:00   
3  1844128223300551170 2024-10-09 21:30:00+00:00   
4  1844122745422159957 2024-10-09 21:08:00+00:00   

                                                text       searchQuery  \
0  selagi gak masuk indonesia mah bodo amat.\nber...  speed Indonesia    
1  Gak sabar nunggu update selanjutnya tentang ak...  speed Indonesia    
2  การ์ดน้องหนาว จากอัลบั้ม aespa 에스파 The 5th Min...  speed Indonesia    
3                                    speed indonesia  speed Indonesia    
4  Gue banget yang ikutin perjalanan Speed di Ind...  speed Indonesia    

                                       clean_content  
0  selagi gak masuk indonesia mah bodo amat\nbera...  
1  Gak sabar nunggu update selanjutnya tentang ak...  
2  การดนองหนาว จากอลบม aespa 에스파 The 5th Mini Alb...  
3    

In [17]:
# Check for duplicates and remove them
df.drop_duplicates(subset=['clean_content'], inplace=True)

In [18]:
# Optionally, drop rows where the 'content' is empty after cleaning
df = df[df['clean_content'].str.strip() != '']

In [19]:
# Save the cleaned dataset for further analysis
df.to_csv("cleaned_twitter_data.csv", index=False)

# Display the first few rows of the cleaned dataset
print("Cleaned Dataset:")
print(df.head())


Cleaned Dataset:
                    id                 timestamp  \
0  1844183629653737485 2024-10-10 01:10:00+00:00   
1  1844171042832646341 2024-10-10 00:20:00+00:00   
2  1844034407159701602 2024-10-09 15:17:00+00:00   
3  1844128223300551170 2024-10-09 21:30:00+00:00   
4  1844122745422159957 2024-10-09 21:08:00+00:00   

                                                text       searchQuery  \
0  selagi gak masuk indonesia mah bodo amat.\nber...  speed Indonesia    
1  Gak sabar nunggu update selanjutnya tentang ak...  speed Indonesia    
2  การ์ดน้องหนาว จากอัลบั้ม aespa 에스파 The 5th Min...  speed Indonesia    
3                                    speed indonesia  speed Indonesia    
4  Gue banget yang ikutin perjalanan Speed di Ind...  speed Indonesia    

                                       clean_content  
0  selagi gak masuk indonesia mah bodo amat\nbera...  
1  Gak sabar nunggu update selanjutnya tentang ak...  
2  การดนองหนาว จากอลบม aespa 에스파 The 5th Mini Alb...  
3    

In [21]:
# count the number of tweets per search query
tweet_count = df['searchQuery'].value_counts()

# Display the number of tweets per search query
print("Tweet Count per Search Query:")
print(tweet_count)


Tweet Count per Search Query:
searchQuery
speed Indonesia     1053
Name: count, dtype: int64


# import data (english tweets) 

In [22]:
# Load english dataset
with open("dataset_easy-twitter-search-scraper_2024-10-10_02-26-27-098.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [23]:
# Convert the JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the dataset to understand its structure
print("Original Dataset:")
print(df.head())

Original Dataset:
                    id                                                url  \
0  1839287616509722900  https://twitter.com/SpeedUpdates1/status/18392...   
1  1844034158705983798  https://twitter.com/aucrum/status/184403415870...   
2  1844031182419497008  https://twitter.com/GarrySpeed47253/status/184...   
3  1844028133567345151  https://twitter.com/aucrum/status/184402813356...   
4  1836718975578690012  https://twitter.com/SpeedUpdates1/status/18367...   

   verified                                             images  \
0      True  [https://cdn.xcancel.comhttps://pbs.twimg.com/...   
1     False                                                NaN   
2     False                                                NaN   
3     False                                                NaN   
4      True                                                NaN   

                  timestamp  \
0  2024-09-26T12:55:00.000Z   
1  2024-10-09T15:16:00.000Z   
2  2024-10-09T15:04:00.000Z  

In [24]:
# Here is columns of interest like 'username', 'content', 'date', etc.
df = df[['id', 'timestamp', 'text', 'searchQuery']]

# Convert 'date' column to datetime format for better time-based analysis
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [25]:
# Clean the 'text' column: remove URLs, hashtags, mentions, and unnecessary characters from the tweets
def clean_text(text):
    if not isinstance(text, str):
        text = ''
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove mentions
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['clean_content'] = df['text'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print("Cleaned Dataset:")
print(df.head())

Cleaned Dataset:
                    id                 timestamp  \
0  1839287616509722900 2024-09-26 12:55:00+00:00   
1  1844034158705983798 2024-10-09 15:16:00+00:00   
2  1844031182419497008 2024-10-09 15:04:00+00:00   
3  1844028133567345151 2024-10-09 14:52:00+00:00   
4  1836718975578690012 2024-09-19 10:48:00+00:00   

                                                text          searchQuery  \
0  Indonesian fan makes a memorial poster of Spee...  speed in Indonesia    
1  Minister Of Communication And Information, Bud...  speed in Indonesia    
2  So in your words you think he is the same as s...  speed in Indonesia    
3  Minister of Communication and Information, Bud...  speed in Indonesia    
4  Speed is currently just having fun & doing sid...  speed in Indonesia    

                                       clean_content  
0  Indonesian fan makes a memorial poster of Spee...  
1  Minister Of Communication And Information Budi...  
2  So in your words you think he is the sa

In [26]:
# Check for duplicates and remove them
df.drop_duplicates(subset=['clean_content'], inplace=True)

In [27]:
# Optionally, drop rows where the 'content' is empty after cleaning
df = df[df['clean_content'].str.strip() != '']


In [28]:
# Save the cleaned dataset for further analysis
df.to_csv("cleaned_twitter_data_english.csv", index=False)

# Display the first few rows of the cleaned dataset
print("Cleaned Dataset:")
print(df.head())

Cleaned Dataset:
                    id                 timestamp  \
0  1839287616509722900 2024-09-26 12:55:00+00:00   
1  1844034158705983798 2024-10-09 15:16:00+00:00   
2  1844031182419497008 2024-10-09 15:04:00+00:00   
3  1844028133567345151 2024-10-09 14:52:00+00:00   
4  1836718975578690012 2024-09-19 10:48:00+00:00   

                                                text          searchQuery  \
0  Indonesian fan makes a memorial poster of Spee...  speed in Indonesia    
1  Minister Of Communication And Information, Bud...  speed in Indonesia    
2  So in your words you think he is the same as s...  speed in Indonesia    
3  Minister of Communication and Information, Bud...  speed in Indonesia    
4  Speed is currently just having fun & doing sid...  speed in Indonesia    

                                       clean_content  
0  Indonesian fan makes a memorial poster of Spee...  
1  Minister Of Communication And Information Budi...  
2  So in your words you think he is the sa

In [30]:
# count the number of tweets per search query
tweet_count = df['searchQuery'].value_counts()
# Display the number of tweets per search query
print("Tweet Count per Search Query:")
print(tweet_count)

Tweet Count per Search Query:
searchQuery
speed in Indonesia     257
#SpeedIndonesia         14
Name: count, dtype: int64


# merge datasets together

In [31]:
# Load the cleaned datasets
df1 = pd.read_csv("cleaned_twitter_data.csv")
df2 = pd.read_csv("cleaned_twitter_data_english.csv")

# Merge the two datasets
df_combined = pd.concat([df1, df2], ignore_index=True)

In [33]:
# view the combined dataset
print("Combined Dataset:")
print(df_combined.head())

Combined Dataset:
                    id                  timestamp  \
0  1844183629653737485  2024-10-10 01:10:00+00:00   
1  1844171042832646341  2024-10-10 00:20:00+00:00   
2  1844034407159701602  2024-10-09 15:17:00+00:00   
3  1844128223300551170  2024-10-09 21:30:00+00:00   
4  1844122745422159957  2024-10-09 21:08:00+00:00   

                                                text       searchQuery  \
0  selagi gak masuk indonesia mah bodo amat.\nber...  speed Indonesia    
1  Gak sabar nunggu update selanjutnya tentang ak...  speed Indonesia    
2  การ์ดน้องหนาว จากอัลบั้ม aespa 에스파 The 5th Min...  speed Indonesia    
3                                    speed indonesia  speed Indonesia    
4  Gue banget yang ikutin perjalanan Speed di Ind...  speed Indonesia    

                                       clean_content  
0  selagi gak masuk indonesia mah bodo amat\nbera...  
1  Gak sabar nunggu update selanjutnya tentang ak...  
2  การดนองหนาว จากอลบม aespa 에스파 The 5th Mini Alb... 

In [34]:
# Save the combined dataset
df_combined.to_csv("combined_cleaned_twitter_data.csv", index=False)


In [36]:
# count the number of tweets per search query
tweet_count = df_combined['searchQuery'].value_counts()

# Display the number of tweets per search query
print("Tweet Count per Search Query:")
print(tweet_count)
# count the total number of tweets 
print("Total Number of Tweets:", len(df_combined))

Tweet Count per Search Query:
searchQuery
speed Indonesia        1053
speed in Indonesia      257
#SpeedIndonesia          14
Name: count, dtype: int64
Total Number of Tweets: 1324
