In [1]:
import pandas as pd
import re

# Load the Social Media Stream CSV file
df = pd.read_csv('social_media_stream.csv')

# Display basic information about the dataset
print("Initial dataframe shape:", df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()


Initial dataframe shape: (50000, 5)
   user_id                 text            timestamp   latitude   longitude
0   148149     fire in downtown  2023-01-01 00:00:00  37.891326 -122.119823
1   133822     fire in downtown  2023-01-01 00:01:00  37.261545 -121.592279
2   117131  emergency in Zone A  2023-01-01 00:02:00  37.976165 -122.425660
3   197271     flood near river  2023-01-01 00:03:00  37.217171 -121.747097
4   105884      earthquake felt  2023-01-01 00:04:00  37.812096 -121.624895
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    50000 non-null  int64  
 1   text       50000 non-null  object 
 2   timestamp  50000 non-null  object 
 3   latitude   50000 non-null  float64
 4   longitude  50000 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.9+ MB
None


In [2]:

# Remove duplicate rows to ensure data uniqueness.
# The result is rebound to `df` rather than using inplace=True, so this step
# follows the same "produce a new frame" pattern as the filter below and does
# not mutate the frame object in place.
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)

# Remove rows where the 'text' column (tweet content) is missing
if 'text' in df.columns:
    df = df[df['text'].notna()]
    print("Shape after filtering out missing text entries:", df.shape)



Shape after removing duplicates: (50000, 5)
Shape after filtering out missing text entries: (50000, 5)


In [3]:
# Convert the 'timestamp' column to datetime values if it exists. Entries that
# cannot be parsed become NaT (errors='coerce') and those rows are removed.
if 'timestamp' in df.columns:
    # assign() returns a new frame instead of writing a column into `df`.
    # `df` may be a boolean-filtered selection at this point, and a direct
    # column write on such a selection can raise SettingWithCopyWarning.
    df = df.assign(timestamp=pd.to_datetime(df['timestamp'], errors='coerce'))
    df = df[df['timestamp'].notna()]
    print("Shape after cleaning timestamps:", df.shape)

# Define a helper function for text cleaning
def clean_text(text) -> str:
    """Normalise a single message string.

    Processing order:
      1. remove URLs (any run of non-whitespace characters starting with "http");
      2. remove every character that is neither a word character nor
         whitespace -- letters, digits and underscores are kept;
      3. collapse each run of whitespace into one space;
      4. strip leading/trailing whitespace and lower-case the result.

    Args:
        text: The raw message. Normally a ``str``. ``None`` and float NaN are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        # e.g. a purely numeric message that was parsed as a number
        text = str(text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation/symbols (word characters, incl. '_', and whitespace stay)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Clean the text and store it in a new column 'clean_text'
if 'text' in df.columns:
    # assign() returns a new frame instead of writing a column into `df`,
    # which may be a filtered selection (direct writes on such a selection
    # can raise SettingWithCopyWarning).
    df = df.assign(clean_text=df['text'].apply(clean_text))
    # This step only adds a column; no rows are filtered here, so the message
    # reports text cleaning alone.
    print("Shape after cleaning text:", df.shape)


Shape after cleaning timestamps: (50000, 5)
Shape after cleaning text and filtering short entries: (50000, 6)


In [5]:

# Write the cleaned frame to a new CSV file; index=False leaves the row index
# out of the file.
output_file = 'social_media_stream_cleaned.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(
    f"Cleaned Social Media Stream data saved successfully at {output_file}."
)


Cleaned Social Media Stream data saved successfully at social_media_stream_cleaned.csv.
