# This is the second part of the overall workflow. [Click here](https://colab.research.google.com/drive/1lsHtQUS_mfGBv_afWIGO_f37nPi_Mfoq?usp=sharing) to see the first part.

In [None]:
# Concatenating all the files into one dataframe


import pandas as pd

import glob

# Directory holding the per-city CSV exports that get combined below.
# NOTE(review): later cells write their outputs with relative paths; if the
# working directory is this same folder, re-running this cell would pick those
# files up as inputs too -- confirm, or keep the raw CSVs in their own folder.
path = r'/content/'

# glob returns matches in arbitrary order, so sort them to keep the row order
# of the combined frame reproducible between runs. Stripping the trailing
# slash avoids building a pattern with a doubled separator.
all_files = sorted(glob.glob(path.rstrip('/') + "/*.csv"))

# Fail with an explicit message rather than a generic concat error when the
# folder contains no CSV files.
if not all_files:
    raise ValueError("No CSV files found in " + path)

# Read every file, treating its first row as the header and using no column
# as the index.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack all frames row-wise and renumber the index from 0.
frame = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
frame.head()

Unnamed: 0,location,year,total_days_snowed,total_days_rained
0,syracuse,2019,86,245
1,portland,2019,10,180
2,san_jose,2019,0,96
3,augusta,2019,1,107
4,chicago,2019,45,235


In [None]:
# Check the (rows, columns) size of the combined frame.
frame.shape

(100, 4)

In [None]:
 # Sorted the dataframe by 'location' column for easier readability


# Ascending order is the sort_values default, so only the key is spelled out.
frame_sorted = frame.sort_values(by='location')
frame_sorted.head()

Unnamed: 0,location,year,total_days_snowed,total_days_rained
34,akron,2019,64,253
11,albuquerque,2019,20,132
94,anaheim,2019,0,83
92,anchorage,2019,44,160
95,arlington,2019,0,171


In [None]:
# Cleaned 'location' column

# Turn the raw slugs (e.g. 'san_jose') into display names ('San Jose').
# Both steps go through the vectorised .str accessor so a missing location
# stays NaN instead of raising inside a per-element lambda, and the
# underscore is replaced as a literal string rather than a regex pattern.
frame_sorted['location'] = (
    frame_sorted['location']
    .str.title()
    .str.replace('_', ' ', regex=False)
)
frame_sorted.head(50)

Unnamed: 0,location,year,total_days_snowed,total_days_rained
34,Akron,2019,64,253
11,Albuquerque,2019,20,132
94,Anaheim,2019,0,83
92,Anchorage,2019,44,160
95,Arlington,2019,0,171
32,Atlanta,2019,1,196
3,Augusta,2019,1,107
30,Aurora,2019,64,167
56,Austin,2019,1,197
24,Bakersfield,2019,0,106


In [None]:
# Downloaded the dataframe for future use

from google.colab import files

# Write the cleaned frame without its index column, then pass the same file
# name to Colab's download helper.
semi_final_csv = 'semi_final.csv'
frame_sorted.to_csv(semi_final_csv, index=False)
files.download(semi_final_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
# Reload the cleaned data from the CSV written in the previous step.
semi_final = pd.read_csv('semi_final.csv')
# Print the (rows, columns) size to confirm nothing was lost in the round trip.
print(semi_final.shape)
semi_final.head()

(100, 4)


Unnamed: 0,location,year,total_days_snowed,total_days_rained
0,Akron,2019,64,253
1,Albuquerque,2019,20,132
2,Anaheim,2019,0,83
3,Anchorage,2019,44,160
4,Arlington,2019,0,171


In [None]:
# Rename 'location' to 'city' so this frame shares a join key with the
# city-identifier table it is merged with later.
semi_final = semi_final.rename({"location": "city"}, axis="columns")
semi_final.head()

Unnamed: 0,city,year,total_days_snowed,total_days_rained
0,Akron,2019,64,253
1,Albuquerque,2019,20,132
2,Anaheim,2019,0,83
3,Anchorage,2019,44,160
4,Arlington,2019,0,171


In [None]:
# Load the lookup table that supplies a unique identifier (city_id) for each city

id_file = pd.read_csv('100city_state_data.csv')
# Print the (rows, columns) size before previewing the first rows
print(id_file.shape)
id_file.head()

(100, 4)


Unnamed: 0,city_id,city,state,city_state
0,0,Anchorage,AK,"Anchorage, AK"
1,1,Chandler,AZ,"Chandler, AZ"
2,2,Gilbert,AZ,"Gilbert, AZ"
3,3,Glendale,AZ,"Glendale, AZ"
4,4,Mesa,AZ,"Mesa, AZ"


In [None]:
# Attach city_id / state information to the snow and rain counts.
# An inner join keeps only the cities whose name appears in both tables.
# NOTE(review): 'city' is the only join key, so two cities that share a name
# in different states would both be matched to the same weather row --
# confirm the city names in id_file are unique.
final = id_file.merge(semi_final, how='inner', on='city')
print(final.shape)
final.head(50)

(100, 7)


Unnamed: 0,city_id,city,state,city_state,year,total_days_snowed,total_days_rained
0,0,Anchorage,AK,"Anchorage, AK",2019,44,160
1,1,Chandler,AZ,"Chandler, AZ",2019,0,88
2,2,Gilbert,AZ,"Gilbert, AZ",2019,0,85
3,3,Glendale,AZ,"Glendale, AZ",2019,0,79
4,4,Mesa,AZ,"Mesa, AZ",2019,0,85
5,5,Phoenix,AZ,"Phoenix, AZ",2019,1,74
6,6,Scottsdale,AZ,"Scottsdale, AZ",2019,0,89
7,7,Tucson,AZ,"Tucson, AZ",2019,1,93
8,8,Anaheim,CA,"Anaheim, CA",2019,0,83
9,9,Bakersfield,CA,"Bakersfield, CA",2019,0,106


In [None]:
# Inspect the last 50 rows of the merged frame.
final.tail(50)

Unnamed: 0,city_id,city,state,city_state,year,total_days_snowed,total_days_rained
50,50,Durham,NC,"Durham, NC",2019,0,194
51,51,Greensboro,NC,"Greensboro, NC",2019,3,195
52,52,Raleigh,NC,"Raleigh, NC",2019,0,203
53,53,Winston Salem,NC,"Winston Salem, NC",2019,4,185
54,54,Lincoln,NE,"Lincoln, NE",2019,29,170
55,55,Omaha,NE,"Omaha, NE",2019,36,198
56,56,Newark,NJ,"Newark, NJ",2019,18,210
57,57,Albuquerque,NM,"Albuquerque, NM",2019,20,132
58,58,Henderson,NV,"Henderson, NV",2019,2,62
59,59,Las Vegas,NV,"Las Vegas, NV",2019,3,58


In [None]:
# sort_values returns a new DataFrame rather than sorting in place, so the
# result must be assigned back for the ordering to take effect.
final = final.sort_values(by=['city_id'])
final.head(50)

Unnamed: 0,city_id,city,state,city_state,year,total_days_snowed,total_days_rained
0,0,Anchorage,AK,"Anchorage, AK",2019,44,160
1,1,Chandler,AZ,"Chandler, AZ",2019,0,88
2,2,Gilbert,AZ,"Gilbert, AZ",2019,0,85
3,3,Glendale,AZ,"Glendale, AZ",2019,0,79
4,4,Mesa,AZ,"Mesa, AZ",2019,0,85


In [None]:
# Show the last 50 rows of `final` again.
final.tail(50)

In [None]:
# Downloaded the dataframe for future use

from google.colab import files

# Save the merged frame without its index column, then pass the same file
# name to Colab's download helper.
snow_rain_csv = 'snow_and_rain.csv'
final.to_csv(snow_rain_csv, index=False)
files.download(snow_rain_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[Click here](https://colab.research.google.com/drive/1mgsddcrdNcRMAy2o95ifUuZSgcMrGfk0?usp=sharing) to see where the 'weather_average' data comes from.

In [None]:
import pandas as pd
# Per-city temperature/humidity averages (prepared in a separate notebook).
df1 = pd.read_csv('weather_average.csv')
# Snow/rain day counts saved earlier in this notebook.
df2 = pd.read_csv('snow_and_rain.csv')

In [None]:
# Preview the first rows of the weather-average data.
df1.head()

Unnamed: 0,city_id,city,state,city_state,summer_maxtempF_mean,summer_mintempF_mean,summer_humidity_mean,winter_maxtempF_mean,winter_mintempF_mean,winter_humidity_mean
0,0,Anchorage,AK,"Anchorage, AK",59,49,77,23,15,84
1,1,Chandler,AZ,"Chandler, AZ",101,83,30,67,49,39
2,2,Gilbert,AZ,"Gilbert, AZ",100,83,30,66,49,39
3,3,Glendale,AZ,"Glendale, AZ",100,83,29,66,49,38
4,4,Mesa,AZ,"Mesa, AZ",100,83,30,66,49,39


In [None]:
# merging with weather_average data

# Only the snow/rain counts are taken from df2; every other column comes
# from df1.
precip_cols = ['total_days_snowed', 'total_days_rained']

# The merged result is later saved as 'weather_average.csv', the same file
# df1 is read from. Dropping any precipitation columns already present in df1
# keeps a re-run from producing duplicated '_x'/'_y' columns; on a first run
# the drop is a no-op.
df_all = pd.merge(
    df1.drop(columns=precip_cols, errors='ignore'),
    df2[['city_id'] + precip_cols],
    on='city_id',
    how='left',
)
df_all.head()

Unnamed: 0,city_id,city,state,city_state,summer_maxtempF_mean,summer_mintempF_mean,summer_humidity_mean,winter_maxtempF_mean,winter_mintempF_mean,winter_humidity_mean,total_days_snowed,total_days_rained
0,0,Anchorage,AK,"Anchorage, AK",59,49,77,23,15,84,44,160
1,1,Chandler,AZ,"Chandler, AZ",101,83,30,67,49,39,0,88
2,2,Gilbert,AZ,"Gilbert, AZ",100,83,30,66,49,39,0,85
3,3,Glendale,AZ,"Glendale, AZ",100,83,29,66,49,38,0,79
4,4,Mesa,AZ,"Mesa, AZ",100,83,30,66,49,39,0,85


In [None]:
# Downloading the final data

from google.colab import files

# NOTE(review): 'weather_average.csv' is also the file df1 was loaded from,
# so this write replaces that input with the merged version.
weather_csv = 'weather_average.csv'
df_all.to_csv(weather_csv, index=False)
files.download(weather_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Now we finally have all the necessary weather information in one dataset. This dataset was then merged with a larger dataset that includes other city data such as population, unemployment, and rental data.

If you want to see all the notebooks used to work on the weather data, click the links below.

[Notebook 1](https://colab.research.google.com/drive/1mgsddcrdNcRMAy2o95ifUuZSgcMrGfk0?usp=sharing)

[Notebook 2](https://colab.research.google.com/drive/12t8tEJqOOZTM5cYhfup9r2n9WgeTgByY?usp=sharing)

[Notebook 3](https://colab.research.google.com/drive/1lsHtQUS_mfGBv_afWIGO_f37nPi_Mfoq?usp=sharing)

[Notebook 4](https://colab.research.google.com/drive/1dp2r_YvLkOO9zQlBjk6ILjtySUqWNNou?usp=sharing)


