In [25]:
import pandas as pd
import numpy as np
import os 

# The raw data from Oklahoma stations is too large to be pushed to GitHub
# The purpose of the following code is to split the raw data up to bite-
# sized chunks. 

# File path to original raw data. Will be deleted from repository
filepath = "OKCityStationData.csv"

os.path.getsize(filepath) # outputs 788589632 bytes so around 788.06 MB


df = pd.read_csv(filepath) # Read in as pd.DataFrame
df = df.copy() # Create copy to avoid modification of original

  df = pd.read_csv(filepath) # Read in as pd.DataFrame


In [None]:
stations = df['STATION'].unique() # List of unique entries of 'STATION' column

##
# Create a new pd.DataFrame for each 'STATION' value
# For each new dataframe write to the csv:
# Station_[insert station value].cvs


# Empty Dictionary to hold the station values as keys and the new dataframes
# as values.


for station in stations:
    station_df = df[df['STATION'] == station]
    station_df.reset_index(inplace=True) # get rid of index values from df
    station_df.drop(['index'], inplace = True,axis=1) #drop 'index' column from .reset_index method
    station_path = f'Station_{station}.csv'
    station_csv = station_df.to_csv(station_path)
    memory_bytes = os.path.getsize(station_path) 
    memory_mb = memory_bytes/(10**6)
    if memory_mb >=100:
        print(f"{station_path} file is too large\n")
        n_splits = memory_mb//100 + 3 # extra wiggle room
        print(f"Splitting {station_path} into {n_splits} parts:\n")
        splits = np.array_split(station_df,n_splits)
        i=1 # Counter
        for split in splits:
            split_path = f'Station_{station}_Part_{i}.csv'
            split_df = pd.DataFrame(split) # Write into pd.DataFrame 
            print(f"Creating {split_path}...")
            split_df.to_csv(split_path)
            print(f"{split_path} created, {len(splits)-(i)} remaining...\n")
            # makes sure file is small enough
            assert(os.path.getsize(split_path)/(10**6)<100)
            i=i+1
        print(f"Finished splitting {station_path}")
        print(f"Removing file {station_path}\n\n")
        os.remove(f'{station_path}')
    else:
        print(f"{station} csv file is fine, no splitting necessary.\n\n")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


Station_72357003948.csv file is too large

Splitting Station_72357003948.csv into 5.0 parts:



  return bound(*args, **kwds)


Creating Station_72357003948_Part_1.csv...
Station_72357003948_Part_1.csv created, 4 remaining...

Creating Station_72357003948_Part_2.csv...
Station_72357003948_Part_2.csv created, 3 remaining...

Creating Station_72357003948_Part_3.csv...
Station_72357003948_Part_3.csv created, 2 remaining...

Creating Station_72357003948_Part_4.csv...
Station_72357003948_Part_4.csv created, 1 remaining...

Creating Station_72357003948_Part_5.csv...
Station_72357003948_Part_5.csv created, 0 remaining...

Finished splitting Station_72357003948.csv
Removing file Station_72357003948.csv




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


99999903948 csv file is fine, no splitting necessary.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


Station_72354013919.csv file is too large

Splitting Station_72354013919.csv into 4.0 parts:



  return bound(*args, **kwds)


Creating Station_72354013919_Part_1.csv...
Station_72354013919_Part_1.csv created, 3 remaining...

Creating Station_72354013919_Part_2.csv...
Station_72354013919_Part_2.csv created, 2 remaining...

Creating Station_72354013919_Part_3.csv...
Station_72354013919_Part_3.csv created, 1 remaining...

Creating Station_72354013919_Part_4.csv...
Station_72354013919_Part_4.csv created, 0 remaining...

Finished splitting Station_72354013919.csv
Removing file Station_72354013919.csv




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


72354499999 csv file is fine, no splitting necessary.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


Station_72353013967.csv file is too large

Splitting Station_72353013967.csv into 4.0 parts:



  return bound(*args, **kwds)


Creating Station_72353013967_Part_1.csv...
Station_72353013967_Part_1.csv created, 3 remaining...

Creating Station_72353013967_Part_2.csv...
Station_72353013967_Part_2.csv created, 2 remaining...

Creating Station_72353013967_Part_3.csv...
Station_72353013967_Part_3.csv created, 1 remaining...

Creating Station_72353013967_Part_4.csv...
Station_72353013967_Part_4.csv created, 0 remaining...

Finished splitting Station_72353013967.csv
Removing file Station_72353013967.csv




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


72354099999 csv file is fine, no splitting necessary.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


72357099999 csv file is fine, no splitting necessary.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


99999903954 csv file is fine, no splitting necessary.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df.drop(['index'], inplace = True,axis=1)


Station_72354403954.csv file is too large

Splitting Station_72354403954.csv into 4.0 parts:



  return bound(*args, **kwds)


Creating Station_72354403954_Part_1.csv...
Station_72354403954_Part_1.csv created, 3 remaining...

Creating Station_72354403954_Part_2.csv...
Station_72354403954_Part_2.csv created, 2 remaining...

Creating Station_72354403954_Part_3.csv...
Station_72354403954_Part_3.csv created, 1 remaining...

Creating Station_72354403954_Part_4.csv...
Station_72354403954_Part_4.csv created, 0 remaining...

Finished splitting Station_72354403954.csv
Removing file Station_72354403954.csv




In [None]:
station_df = df[df['STATION'] == stations[3]]
station_df.reset_index(inplace=True)
station_df.drop(['index'],axis =1)

cleaned = pd.read_csv("/Users/taylormurray/Documents/GitHub/fall-2025-predicting-tornadoes/Data/OK City Station Data/Cleaned Data/cleaned_S6P1.csv")


  cleaned = pd.read_csv("/Users/taylormurray/Documents/GitHub/fall-2025-predicting-tornadoes/Data/OK City Station Data/Cleaned Data/cleaned_S6P1.csv")


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [24]:
# 'Unnamed: 0' columns in some of the new csv's, we drop this.

file_list= os.listdir("/Users/taylormurray/Documents/GitHub/fall-2025-predicting-tornadoes/Data/OK City Station Data/Raw Data")

csv_file_list = [file for file in file_list if ".csv" in file]

for file in csv_file_list:
    data = pd.read_csv(file)
    
    print(data.columns)
    if 'Unnamed: 0' in data.columns:
        data = data.drop(['Unnamed: 0'], axis=1, inplace = True)
        assert("Unnamed: 0" not in data.columns)
        data.to_csv(f"../Cleaned Data/{file}")
    else:
        data.to_csv(f"../Cleaned Data/{file}")
        continue
        
        
        


  data = pd.read_csv(file)


Index(['Unnamed: 0', 'STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
       'DATE', 'SOURCE', 'REPORT_TYPE', 'CALL_SIGN',
       ...
       'RH3', 'SA1', 'SLP', 'SOURCE.1', 'TMP', 'UA1', 'UG1', 'VIS', 'WA1',
       'WND'],
      dtype='object', length=132)


AttributeError: 'NoneType' object has no attribute 'columns'