In [2]:
import pandas as pd
import os
import zipfile
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from langdetect import detect, LangDetectException
from googletrans import Translator # Assuming googletrans for Translator
# from textblob import TextBlob # Assuming you use TextBlob for sentiment if sid is not defined
# from nltk.sentiment.vader import SentimentIntensityAnalyzer # If you intend to use VADER for sentiment
from collections import defaultdict # Keep this if you use the defaultdict for logging duplicates within files
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
import xlsxwriter

In [None]:
# Make sure the VADER lexicon is available. The download is attempted only
# when the resource cannot be found locally, so re-running the notebook is a
# no-op once the lexicon has been installed.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

In [None]:
main_folder_path = ' '

# Columns every workbook must provide. They are checked once per file so a
# missing column is reported by name instead of as a bare KeyError.
REQUIRED_COLUMNS = ('text', 'created_at', 'hashtag', 'mention', 'Tweet Type', 'id')


def extract_tweet_records(df):
    """Build one result dict per row of ``df`` whose 'text' is a non-blank string.

    Rows whose 'text' value is not a ``str`` (for example NaN from an empty
    cell) or is empty after stripping whitespace are skipped.

    Args:
        df: DataFrame containing every column listed in ``REQUIRED_COLUMNS``.

    Returns:
        list of dicts with the keys 'creation_datetime', 'hashtag', 'mention',
        'Tweet Type', 'tweet_id' and 'text' (the stripped tweet text).

    Raises:
        ValueError: if ``df`` lacks any of ``REQUIRED_COLUMNS``; the message
            lists the missing column names.
    """
    missing = [column for column in REQUIRED_COLUMNS if column not in df.columns]
    if missing:
        raise ValueError(f"missing required column(s): {missing}")

    records = []
    # Walk the needed columns in lockstep rather than with iterrows(), which
    # constructs a new Series object for every row.
    for text, created_at, hashtag, mention, tweet_type, tweet_id in zip(
        df['text'], df['created_at'], df['hashtag'],
        df['mention'], df['Tweet Type'], df['id'],
    ):
        if not isinstance(text, str):
            continue
        text = text.strip()
        if not text:
            continue

        # If you intend to use translation and sentiment, uncomment these lines
        # translated_tweet = process_tweet(text)
        # sentiment_scores = sid.polarity_scores(translated_tweet) # Make sure sid is initialized

        records.append({
            'creation_datetime': created_at,
            'hashtag': hashtag,
            'mention': mention,
            'Tweet Type': tweet_type,
            'tweet_id': tweet_id,  # 'id' column of the source workbook
            'text': text,
            # 'sentiment_score': sentiment_scores # Uncomment if sentiment is calculated
        })
    return records


# Raw per-file DataFrames, and the flattened per-tweet records of all files
dfs = []
all_results = []

# Recursively traverse through all subfolders and files
for root, dirs, files in os.walk(main_folder_path):
    # Sort names in place so the traversal order is deterministic
    # (chronological only if folder/file names sort by date).
    dirs.sort()
    files.sort()

    for file_name in files:
        # Names starting with '~$' are Excel's temporary lock files, not workbooks.
        if not file_name.endswith('.xlsx') or file_name.startswith('~$'):
            continue

        logging.info(f"Processing file: {file_name}")
        file_path = os.path.join(root, file_name)
        try:
            df = pd.read_excel(file_path, dtype={'id': str})
            file_records = extract_tweet_records(df)
        except Exception as e:
            # A file that cannot be read or lacks required columns is skipped
            # entirely, so dfs and all_results always describe the same files.
            logging.error(f"Error reading or processing file '{file_name}': {e}")
            continue

        # Keep the origin on the frame itself so it can be identified later.
        df.attrs['source_file'] = file_path
        dfs.append(df)
        all_results.extend(file_records)

In [None]:
# --- Duplicate check within individual files ---
# Maps each tweet ID that occurs more than once inside a single DataFrame to
# the label(s) of the DataFrame(s) in which that happens.
duplicate_id_files = defaultdict(list)
total_dfs = len(dfs)

for i, df in enumerate(dfs):
    logging.info(f"Analyzing DataFrame {i+1} of {total_dfs} for internal duplicates.")

    if 'id' not in df.columns:
        # Without this guard the duplicate lookup below raises KeyError.
        logging.warning(f"DataFrame {i+1} has no 'id' column; skipping duplicate check.")
        continue

    # Missing IDs are dropped first: pandas treats NaN values as equal to each
    # other, which would report rows without an ID as duplicates.
    ids = df['id'].dropna()
    duplicated_ids = ids[ids.duplicated(keep=False)].unique()
    if len(duplicated_ids) == 0:
        continue

    # Use the path stored under attrs['source_file'] when one is present.
    # Attribute access such as df.name is avoided on purpose: it resolves to
    # a *column* called 'name' if the sheet happens to have one.
    source_path = df.attrs.get('source_file')
    source_name = os.path.basename(source_path) if isinstance(source_path, str) else 'Unknown'
    file_name = f"DataFrame_{i+1} (File: {source_name})"

    for tweet_id in duplicated_ids:
        duplicate_id_files[tweet_id].append(file_name)

# Log up to ten sample tweet IDs together with the frame(s) they appeared in
if duplicate_id_files:
    logging.warning("Found tweet IDs duplicated WITHIN individual files. Examples:")
    for tweet_id, file_labels in list(duplicate_id_files.items())[:10]:
        logging.warning(f"  Tweet ID {tweet_id} found in files: {file_labels}")
else:
    logging.info("No tweet IDs duplicated within individual files.")

In [None]:
# Create a new DataFrame with the results for all tweets.
# Rows are deliberately NOT de-duplicated here: tweet IDs that repeat across
# the combined data are kept and only reported further down.
result_df = pd.DataFrame(all_results)

# Destination workbook for the combined data
output_excel_path = ' '

if result_df.empty:
    # With no collected records the frame has no columns at all, so every
    # 'tweet_id' lookup below would raise KeyError.
    logging.warning("No tweets were collected; skipping export and duplicate analysis.")
else:
    # Store tweet IDs as quoted text to avoid the Excel scientific notation issue.
    result_df['tweet_id'] = "'" + result_df['tweet_id'].astype(str) + "'"

    # Save the results
    try:
        result_df.to_excel(output_excel_path, index=False, engine='xlsxwriter')
        logging.info(f"Successfully saved combined data to '{output_excel_path}'. Total rows: {len(result_df)}")
    except Exception as e:
        logging.error(f"Error saving output Excel file: {e}")

    # --- Duplicate check across the full dataset (for verification) ---
    # keep=False marks every occurrence of a repeated ID, not just the later ones.
    duplicate_ids_after_combine = result_df[result_df.duplicated('tweet_id', keep=False)]

    if not duplicate_ids_after_combine.empty:
        logging.info("\n--- Analysis of Duplicated Tweet IDs Across Combined Dataset (as expected) ---")
        logging.info(f"Total distinct tweet IDs that are duplicated across the combined dataset: {duplicate_ids_after_combine['tweet_id'].nunique()}")
        logging.info("Count of each duplicated ID (showing top 10):")
        logging.info(duplicate_ids_after_combine['tweet_id'].value_counts().head(10))
        logging.info("Sample duplicated tweet_ids and texts (showing first 5 entries):")
        logging.info(duplicate_ids_after_combine[['tweet_id', 'text']].head(5))
        logging.info(f"Total rows in final result_df: {len(result_df)}")
    else:
        logging.info("No tweet IDs are duplicated across the combined dataset (this would be unexpected if you had duplicates before).")