In [None]:
# !pip install openpyxl


In [None]:
import pandas as pd
import os
from langdetect import detect, LangDetectException
from openai import OpenAI
# Read the API key from the OPENAI_API_KEY environment variable so that no
# credential is stored in the notebook; when the variable is unset this falls
# back to the empty key that was used here before.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', ''))


In [2]:
def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: The text to classify. Values that are not strings (for example
            ``None`` or the float NaN pandas uses for missing cells) and
            blank strings are treated as undetectable.

    Returns:
        The value returned by ``langdetect.detect`` for ``text``, or ``None``
        when the input is not usable text or langdetect raises
        ``LangDetectException``.
    """
    # Non-string input would raise TypeError inside langdetect, which the
    # except clause below does not cover; blank text has no features to detect.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None

def get_response(tweet='', model='gpt-4o'):
    """Ask the chat model whether a tweet is Arabic and about AI in education.

    Sends a single user message containing the classification prompt and the
    tweet through the module-level ``client``.

    Args:
        tweet: Text of the tweet to classify.
        model: Name of the chat-completion model to query.

    Returns:
        'yes' when the model's reply contains "yes" (compared
        case-insensitively), otherwise 'no'. A reply without any text
        content is also reported as 'no'.
    """
    prompt = f'''Please analyze the following tweet and determine whether it meets **both** of the following criteria:

            1. The tweet is written in Arabic.
            2. The tweet discusses the usage of artificial intelligence (AI) in learning or education.

            If both conditions are met, respond with "Yes." If not, respond with "No."

            Tweet: {tweet}

            '''

    response = client.chat.completions.create(
        model=model,
        messages=[{
            'role': 'user',
            'content': prompt
            }
        ],
        temperature=0,
        max_tokens=10
    )
    # The reply text can be None; normalise it so the membership test below
    # never raises, and ignore letter case so "yes" / "YES" also count.
    reply = response.choices[0].message.content or ''
    if 'yes' in reply.lower():
        return 'yes'

    return 'no'


In [3]:
# df = pd.read_csv('data/الذكاء_الاصطناعي/file3.csv')
# df.head()


In [4]:
# ls_of_tweets = list(df['tweet'][:3])
# for tweet in ls_of_tweets:
#     print(get_response(tweet))

In [5]:
import time

data_dir = 'data/'
dict_table_before = {}
dict_table_after = {}

# Loop over each entry in the data directory; every sub-directory is treated
# as one hashtag whose CSV files are combined, cleaned and classified.
for folder in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, folder)
    print(folder)
    print('='*50)
    # Only directories are processed; loose files in data_dir are skipped
    if os.path.isdir(folder_path):
        # Read every CSV file found directly inside the folder
        dataframes = []
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if os.path.isfile(file_path) and file.endswith('.csv'):
                dataframes.append(pd.read_csv(file_path))

        # Concatenate all dataframes in the list into one DataFrame
        if dataframes:
            combined_df = pd.concat(dataframes, ignore_index=True)

            # Totals before any cleaning or filtering
            dict_table_before[folder] = {
                'all_like_counts': combined_df['like_count'].sum(),
                'all_retweet_counts': combined_df['retweet_count'].sum(),
                'all_tweet_counts_of_hashtag': len(combined_df)
            }

            # Data cleaning and formatting
            combined_df = combined_df.drop_duplicates(subset='tweet', keep='first')
            # Rows without tweet text cannot be language-detected or classified,
            # so they are dropped together with rows lacking a timestamp.
            # .copy() makes the column assignments below act on an independent
            # frame instead of a slice (avoids SettingWithCopyWarning).
            combined_df = combined_df.dropna(subset=['tweet', 'timestamp']).copy()
            combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp']).dt.strftime('%d/%m/%Y')
            combined_df['language'] = combined_df['tweet'].apply(detect_language)
            combined_df = combined_df[combined_df['language'] == 'ar'].copy()

            # One model call per remaining tweet
            combined_df['is_about_learning'] = combined_df['tweet'].apply(
                lambda tweet: get_response(tweet) == "yes"
            )

            # The explicit comparison is kept on purpose: it always yields a
            # boolean mask, even when no rows are left and the column is empty.
            combined_df = combined_df[combined_df['is_about_learning'] == True]

            # Saving stays best-effort (a failed export must not stop the run),
            # but failures are reported instead of being silently discarded,
            # and the CSV and Excel exports are attempted independently.
            save_dir = os.path.join('cleaned_tweets', folder)
            try:
                os.makedirs(save_dir, exist_ok=True)
            except OSError as exc:
                print(f'Could not create {save_dir}: {exc}')

            exports = (
                ('combined_hashtag.csv', combined_df.to_csv),
                ('combined_hashtag.xlsx', combined_df.to_excel),
            )
            for file_name, write in exports:
                save_path = os.path.join(save_dir, file_name)
                try:
                    write(save_path, index=False)
                except Exception as exc:
                    print(f'Could not save {save_path}: {exc}')

            # Totals after cleaning and filtering
            dict_table_after[folder] = {
                'all_like_counts': combined_df['like_count'].sum(),
                'all_retweet_counts': combined_df['retweet_count'].sum(),
                'all_tweet_counts_of_hashtag': len(combined_df)
            }
        # NOTE(review): pause between folders, presumably to ease API rate
        # limits -- confirm whether both this and the sleep below are needed.
        time.sleep(10)
    time.sleep(10)



# Optionally, output the results for verification
print("Initial Aggregated Data:")
for key, value in dict_table_before.items():
    print(f"Folder: {key}, Data: {value}")

print("\nProcessed and Filtered Data:")
for key, value in dict_table_after.items():
    print(f"Folder: {key}, Data: {value}")


الذكاء_الاصطناعي
الذكاء_الاصطناعي_في_التعليم
الذكاء_التوليدي
الذكاء_التوليدي_في_التعليم
الذكاءالاصطناعي
الذكاءالتوليدي
بالذكاء_الاصطناعي
بالذكاء_التوليدي
ذكاء_اصطناعي
ذكاء_التعليم
ذكاء_توليدي
Initial Aggregated Data:
Folder: الذكاء_الاصطناعي, Data: {'all_like_counts': 84809, 'all_retweet_counts': 20087, 'all_tweet_counts_of_hashtag': 6249}
Folder: الذكاء_الاصطناعي_في_التعليم, Data: {'all_like_counts': 42943, 'all_retweet_counts': 17250, 'all_tweet_counts_of_hashtag': 3176}
Folder: الذكاء_التوليدي, Data: {'all_like_counts': 24651, 'all_retweet_counts': 10703, 'all_tweet_counts_of_hashtag': 1789}
Folder: الذكاء_التوليدي_في_التعليم, Data: {'all_like_counts': 4872, 'all_retweet_counts': 8605, 'all_tweet_counts_of_hashtag': 242}
Folder: الذكاءالاصطناعي, Data: {'all_like_counts': 867, 'all_retweet_counts': 193, 'all_tweet_counts_of_hashtag': 326}
Folder: الذكاءالتوليدي, Data: {'all_like_counts': 6, 'all_retweet_counts': 1, 'all_tweet_counts_of_hashtag': 1}
Folder: بالذكاء_الاصطناعي, Data: {'

In [6]:
# Build the pre-filtering summary table: one row per folder, with the folder
# name moved out of the index into a regular 'folder' column.
df_before = (
    pd.DataFrame.from_dict(dict_table_before, orient='index')
    .reset_index()
    .rename(columns={'index': 'folder'})
)

# Write the table to an Excel workbook (index column omitted)
df_before.to_excel('dict_table_before.xlsx', index=False)

In [16]:
# Build the post-filtering summary table: one row per folder, with the folder
# name moved out of the index into a regular 'folder' column.
df_after = (
    pd.DataFrame.from_dict(dict_table_after, orient='index')
    .reset_index()
    .rename(columns={'index': 'folder'})
)

# Export is disabled in this cell
# df_after.to_excel('dict_table_after.xlsx', index=False)
df_after.head()

Unnamed: 0,folder,all_like_counts,all_retweet_counts,all_tweet_counts_of_hashtag
0,الذكاء_الاصطناعي,17783,3303,468
1,الذكاء_الاصطناعي_في_التعليم,30327,13814,2517
2,الذكاء_التوليدي,6995,5789,263
3,الذكاء_التوليدي_في_التعليم,3729,5308,185
4,الذكاءالاصطناعي,169,40,34


In [9]:
# Preview the first rows of the pre-filtering summary table
df_before.head()

Unnamed: 0,folder,all_like_counts,all_retweet_counts,all_tweet_counts_of_hashtag
0,الذكاء_الاصطناعي,84809,20087,6249
1,الذكاء_الاصطناعي_في_التعليم,42943,17250,3176
2,الذكاء_التوليدي,24651,10703,1789
3,الذكاء_التوليدي_في_التعليم,4872,8605,242
4,الذكاءالاصطناعي,867,193,326


In [17]:
# Each folder's share (in percent, two decimals) of all tweets that remain
# after filtering, summed over every folder in df_after.
kept_counts = df_after['all_tweet_counts_of_hashtag']
kept_total = sum(kept_counts)
df_after['percentage'] = (kept_counts / kept_total * 100).round(2)
df_after.head()

Unnamed: 0,folder,all_like_counts,all_retweet_counts,all_tweet_counts_of_hashtag,percentage
0,الذكاء_الاصطناعي,17783,3303,468,12.19
1,الذكاء_الاصطناعي_في_التعليم,30327,13814,2517,65.56
2,الذكاء_التوليدي,6995,5789,263,6.85
3,الذكاء_التوليدي_في_التعليم,3729,5308,185,4.82
4,الذكاءالاصطناعي,169,40,34,0.89


In [None]:
# Total number of tweets kept across all folders (the denominator used for
# the 'percentage' column). A bare sum() call raises TypeError because the
# iterable argument is required, so the column total is computed explicitly.
df_after['all_tweet_counts_of_hashtag'].sum()

In [18]:
# Write the post-filtering summary table, with whatever columns df_after
# holds at this point, to an Excel workbook (index column omitted)
df_after.to_excel('dict_table_after.xlsx', index=False)

In [15]:
# Display the raw per-folder totals collected after cleaning and filtering
dict_table_after

{'الذكاء_الاصطناعي': {'all_like_counts': 17783,
  'all_retweet_counts': 3303,
  'all_tweet_counts_of_hashtag': 468},
 'الذكاء_الاصطناعي_في_التعليم': {'all_like_counts': 30327,
  'all_retweet_counts': 13814,
  'all_tweet_counts_of_hashtag': 2517},
 'الذكاء_التوليدي': {'all_like_counts': 6995,
  'all_retweet_counts': 5789,
  'all_tweet_counts_of_hashtag': 263},
 'الذكاء_التوليدي_في_التعليم': {'all_like_counts': 3729,
  'all_retweet_counts': 5308,
  'all_tweet_counts_of_hashtag': 185},
 'الذكاءالاصطناعي': {'all_like_counts': 169,
  'all_retweet_counts': 40,
  'all_tweet_counts_of_hashtag': 34},
 'الذكاءالتوليدي': {'all_like_counts': 0,
  'all_retweet_counts': 0,
  'all_tweet_counts_of_hashtag': 0},
 'بالذكاء_الاصطناعي': {'all_like_counts': 5190,
  'all_retweet_counts': 1027,
  'all_tweet_counts_of_hashtag': 172},
 'بالذكاء_التوليدي': {'all_like_counts': 271,
  'all_retweet_counts': 69,
  'all_tweet_counts_of_hashtag': 8},
 'ذكاء_اصطناعي': {'all_like_counts': 39,
  'all_retweet_counts': 21