In [1]:
# Author: Andres Melendez
# Date: 9 / 27 / 2024
# Description: This script reads stock data from multiple CSV files, 
# adds ticker symbols, combines the data, and saves it to a new CSV file.

import pandas as pd

def load_data(file_path):
    """
    Read a single CSV file of stock data into memory.

    Parameters:
    file_path (str): Location of the CSV file to read.

    Returns:
    DataFrame: The file's contents as parsed by ``pd.read_csv`` with
    its default settings.
    """
    stock_frame = pd.read_csv(file_path)
    return stock_frame



In [2]:
# Read one CSV per ticker into its own DataFrame.
# If any file is missing, report which one and then re-raise: swallowing the
# error would leave some of these names undefined, and the cells that use
# them would fail later with a confusing NameError instead of the real cause.
try:
    aapl = load_data('data/Mod5/aapl.csv')
    amzn = load_data('data/Mod5/amzn.csv')
    fb = load_data('data/Mod5/fb.csv')
    goog = load_data('data/Mod5/goog.csv')
    nflx = load_data('data/Mod5/nflx.csv')
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise



In [3]:
# Tag every per-company frame with its ticker symbol (adds a 'ticker' column
# to each frame in place), keeping the frames in the order they are stacked.
tagged_frames = []
for symbol, frame in (
    ('AAPL', aapl),
    ('AMZN', amzn),
    ('FB', fb),
    ('GOOG', goog),
    ('NFLX', nflx),
):
    frame['ticker'] = symbol
    tagged_frames.append(frame)

# Stack the frames vertically; ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each frame's own row numbers.
faang = pd.concat(tagged_frames, ignore_index=True)

# Write the combined data to faang.csv without the index column.
faang.to_csv('faang.csv', index=False)

# Preview the first rows of the combined frame.
print(faang.head())


         date       high        low       open      close       volume ticker
0  2018-01-02  43.075001  42.314999  42.540001  43.064999  102223600.0   AAPL
1  2018-01-03  43.637501  42.990002  43.132500  43.057499  118071600.0   AAPL
2  2018-01-04  43.367500  43.020000  43.134998  43.257500   89738400.0   AAPL
3  2018-01-05  43.842499  43.262501  43.360001  43.750000   94640000.0   AAPL
4  2018-01-08  43.902500  43.482498  43.587502  43.587502   82271200.0   AAPL


In [4]:
# Reload the combined stock data from faang.csv into a DataFrame called faang.
faang = pd.read_csv('faang.csv')

# Step 1: Parse the 'date' column from text into datetime values.
faang['date'] = pd.to_datetime(faang['date'])

# Step 2: Store 'volume' as integers. An explicit 64-bit width is requested so
# the resulting dtype does not depend on the platform's default integer size.
faang['volume'] = faang['volume'].astype('int64')

# Step 3: Sort by 'date' then 'ticker' and renumber the rows 0..n-1.
# Chaining returns a new frame rather than mutating in place.
faang = faang.sort_values(by=['date', 'ticker']).reset_index(drop=True)

# Display the first few rows to verify the changes.
print(faang.head())

        date         high          low         open        close     volume  \
0 2018-01-02    43.075001    42.314999    42.540001    43.064999  102223600   
1 2018-01-02  1190.000000  1170.510010  1172.000000  1189.010010    2694500   
2 2018-01-02   181.580002   177.550003   177.679993   181.419998   18151900   
3 2018-01-02  1066.939941  1045.229980  1048.339966  1065.000000    1237600   
4 2018-01-02   201.649994   195.419998   196.100006   201.070007   10966900   

  ticker  
0   AAPL  
1   AMZN  
2     FB  
3   GOOG  
4   NFLX  


In [5]:
# Select the 7 rows with the smallest 'volume' values, smallest first.
lowest_volume_rows = faang.nsmallest(n=7, columns='volume')

# Show the selected rows.
print(lowest_volume_rows)


           date         high          low         open        close  volume  \
633  2018-07-03  1135.819946  1100.020020  1135.819946  1102.890015  679000   
1133 2018-11-23  1037.589966  1022.398987  1030.000000  1023.880005  691500   
498  2018-05-24  1080.469971  1066.150024  1079.000000  1079.239990  766800   
653  2018-07-10  1159.589966  1149.589966  1156.979980  1152.839966  798400   
763  2018-08-09  1255.541992  1246.010010  1249.900024  1249.099976  848600   
798  2018-08-20  1211.000000  1194.625977  1205.020020  1207.770020  870800   
808  2018-08-22  1211.839966  1199.000000  1200.000000  1207.329956  887400   

     ticker  
633    GOOG  
1133   GOOG  
498    GOOG  
653    GOOG  
763    GOOG  
798    GOOG  
808    GOOG  


In [6]:
# Reshape to fully long format: 'date' and 'ticker' stay as identifier
# columns, and each price/volume column becomes rows of
# (measurement name, value).
faang_long = pd.melt(
    faang,
    id_vars=['date', 'ticker'],
    value_vars=['open', 'high', 'low', 'close', 'volume'],
    var_name='measurement',
    value_name='value',
)

# Preview the first rows of the long-format frame.
print(faang_long.head())


        date ticker measurement        value
0 2018-01-02   AAPL        open    42.540001
1 2018-01-02   AMZN        open  1172.000000
2 2018-01-02     FB        open   177.679993
3 2018-01-02   GOOG        open  1048.339966
4 2018-01-02   NFLX        open   196.100006


Handling data glitches, such as the one on July 26, 2018, requires careful consideration. Here are some steps you could take:

1. Identify the Glitch: Review the data for that specific date to understand the nature of the glitch. This could involve looking for anomalies, such as missing values, unrealistic high or low values, or inconsistencies in the data.

2. Assess Impact: Determine how this glitch affects your analysis. If it's a minor issue that doesn't significantly impact results, you might consider keeping the data with a note.

3. Correction Options:
    - Imputation: Where possible, estimate replacement values from the surrounding data (e.g., by averaging the values from the adjacent trading days).
    - Removal: If the data is too compromised, you might choose to remove that date’s data entirely from your analysis.
    - Flagging: If you keep the data, flag the affected entries in your dataset to indicate that they may not be reliable.

4. Document the Issue: Make detailed notes about the glitch, how it was identified, and the steps taken to address it. This is important for transparency and for anyone reviewing your work.

5. Communicate Findings: If the data is to be shared or used in reporting, communicate the issue and how it was handled to stakeholders, ensuring they understand any limitations in the analysis.