# Data Processing

## Data Aggregation

In [24]:
import pandas as pd

In [25]:
# Load the raw inputs: one CSV of tweets and one CSV of stock prices.
# Paths are relative to the notebook's working directory.
tweet_csv_path = "../data/tweet_data.csv"
stock_csv_path = "../data/stock_data.csv"
tweet_df = pd.read_csv(tweet_csv_path)
stock_df = pd.read_csv(stock_csv_path)

In [26]:
# Parse the stock 'Date' column as datetimes, then keep only the calendar-date
# part (drops any time-of-day component) so it can serve as a join key.
stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.date

In [27]:
# Parse the tweet 'Date' column as datetimes and reduce each value to its
# calendar date, mirroring the treatment of the stock data's 'Date' column.
tweet_timestamps = pd.to_datetime(tweet_df['Date'])
tweet_df['Date'] = tweet_timestamps.dt.date

In [28]:
# Drop the full company name column from the tweet data.
# The result is rebound instead of using inplace=True, and errors='ignore'
# makes the cell idempotent: re-running it after the column has already been
# removed no longer raises KeyError.
tweet_df = tweet_df.drop(columns='Company Name', errors='ignore')

In [29]:
# List the column names of both input frames, one name per line, with a blank
# line separating the two listings.
print("Tweet Data Columns: ")
print("".join(str(name) + "\n" for name in tweet_df.columns), end="")
print()
print("Stock Data Columns: ")
print("".join(str(name) + "\n" for name in stock_df.columns), end="")

Tweet Data Columns: 
Date
Tweet
Stock Name

Stock Data Columns: 
Date
Open
High
Low
Close
Adj Close
Volume
Stock Name


In [30]:
# Count the distinct calendar dates that occur in both the tweet data and the
# stock data.
tweet_dates = set(tweet_df['Date'].unique())
stock_dates = set(stock_df['Date'].unique())
shared_dates = tweet_dates.intersection(stock_dates)
print("Number of dates in both data: ", len(shared_dates))

Number of dates in both data:  252


In [36]:
# Attach stock data to each tweet by matching on both the date and the stock
# name. This is a left join: every tweet row is kept, and tweets with no
# matching stock row get NaN in the stock columns.
merge_keys = ['Date', 'Stock Name']
joined_df = tweet_df.merge(stock_df, how='left', on=merge_keys)

In [40]:
# List the column names of the joined frame, one per line.
print("Joined Data Columns: ")
print("".join(str(name) + "\n" for name in joined_df.columns), end="")

# Report the joined frame's (rows, columns) shape.
joined_shape = joined_df.shape
print("Joined Data Shape: ", joined_shape)

Joined Data Columns: 
Date
Tweet
Stock Name
Open
High
Low
Close
Adj Close
Volume
Joined Data Shape:  (80793, 9)


In [41]:
# Persist the joined frame as CSV; index=False leaves the row index out of the
# written file.
joined_csv_path = "../data/joined_data.csv"
joined_df.to_csv(joined_csv_path, index=False)

## Data Cleaning