In [1]:
import pandas as pd
import os
import datetime as dt

# Import data and libraries

In [2]:
def read_data(pth1:str, pth2:str) -> pd.DataFrame:
    """Load a CSV file, falling back to a second location.

    Parameters
    ----------
    pth1 : str
        Path that is tried first.
    pth2 : str
        Path that is read only when reading ``pth1`` raises ``OSError``.

    Returns
    -------
    pd.DataFrame
        The parsed contents of whichever file was read.

    Notes
    -----
    Only ``OSError`` from the first read triggers the fallback; any
    exception raised while reading ``pth2`` propagates to the caller.
    """
    try:
        return pd.read_csv(pth1)
    except OSError:
        # Primary location could not be opened: use the alternative path.
        return pd.read_csv(pth2)

In [3]:
# Primary and fallback locations of the London weather CSV passed to read_data.
# NOTE(review): both paths are currently identical, so the fallback cannot
# succeed where the primary read fails — confirm the intended second location.
pth1 = 'Documents/=КУРС/weather/london_weather.csv'
pth2 = 'Documents/=КУРС/weather/london_weather.csv'

In [4]:
df = read_data(pth1, pth2) 

In [5]:
df.head()

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,19790101,2.0,7.0,52.0,2.3,-4.1,-7.5,0.4,101900.0,9.0
1,19790102,6.0,1.7,27.0,1.6,-2.6,-7.5,0.0,102530.0,8.0
2,19790103,5.0,0.0,13.0,1.3,-2.8,-7.2,0.0,102050.0,4.0
3,19790104,8.0,0.0,13.0,-0.3,-2.6,-6.5,0.0,100840.0,2.0
4,19790105,6.0,2.0,29.0,5.6,-0.8,-1.4,0.0,102250.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15341 entries, 0 to 15340
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              15341 non-null  int64  
 1   cloud_cover       15322 non-null  float64
 2   sunshine          15341 non-null  float64
 3   global_radiation  15322 non-null  float64
 4   max_temp          15335 non-null  float64
 5   mean_temp         15305 non-null  float64
 6   min_temp          15339 non-null  float64
 7   precipitation     15335 non-null  float64
 8   pressure          15337 non-null  float64
 9   snow_depth        13900 non-null  float64
dtypes: float64(9), int64(1)
memory usage: 1.2 MB


In [7]:
df.duplicated().sum()

0

In [8]:
df.isna().sum()

date                   0
cloud_cover           19
sunshine               0
global_radiation      19
max_temp               6
mean_temp             36
min_temp               2
precipitation          6
pressure               4
snow_depth          1441
dtype: int64

In [9]:
def add_symbols(date: int) -> str:
    """Insert ``/`` separators into a ``YYYYMMDD`` value.

    Parameters
    ----------
    date : int
        Date written as digits, e.g. ``19790101``.

    Returns
    -------
    str
        The first four characters, the next two and the following two of
        ``str(date)`` joined by ``/``, e.g. ``'1979/01/01'``.
    """
    digits = str(date)
    year, month, day = digits[:4], digits[4:6], digits[6:8]
    return '/'.join((year, month, day))

In [10]:
# Rewrite each integer YYYYMMDD date as a 'YYYY/MM/DD' string so it can be parsed later.
df.date = df.date.apply(add_symbols)

In [11]:
df.sample(5)

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
2443,1985/09/09,7.0,0.9,79.0,22.7,17.5,14.1,0.0,102360.0,0.0
7628,1999/11/20,5.0,1.7,35.0,8.0,5.2,3.6,0.0,102260.0,0.0
9496,2004/12/31,6.0,1.6,26.0,12.6,10.6,8.5,0.0,102340.0,0.0
9251,2004/04/30,8.0,0.0,71.0,13.3,10.0,7.5,3.4,100230.0,0.0
9600,2005/04/14,7.0,2.5,122.0,11.8,9.2,6.7,0.0,100110.0,0.0


In [12]:
# Parse the 'YYYY/MM/DD' strings into pandas datetime values.
df.date = pd.to_datetime(df.date, format= "%Y/%m/%d")

In [13]:
df.sample(5)

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
5980,1995-05-17,8.0,0.1,85.0,14.6,10.2,8.5,5.0,100010.0,0.0
2299,1985-04-18,3.0,9.5,226.0,20.2,12.6,6.5,0.0,102860.0,0.0
9342,2004-07-30,6.0,11.0,273.0,28.0,21.4,17.4,0.0,101880.0,0.0
6336,1996-05-07,4.0,12.3,287.0,14.1,9.2,3.3,0.0,101870.0,0.0
5494,1994-01-16,4.0,4.9,49.0,4.5,3.6,2.0,0.0,101330.0,0.0


# Fixing NaNs

## snow_depth

It is logical to presume that there is no snow in summer. Therefore, missing snow depth values outside the cold months (November–March) are filled with 0.

In [14]:
df.sample(5)

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
10043,2006-07-01,0.0,11.5,298.0,29.7,23.0,16.3,0.0,102290.0,
1303,1982-07-27,5.0,4.5,184.0,22.7,14.2,10.3,0.0,102380.0,0.0
15011,2020-02-06,4.0,3.5,68.0,9.6,4.8,0.0,0.0,103250.0,
10184,2006-11-19,4.0,6.6,64.0,12.4,6.8,1.1,7.2,101550.0,0.0
2511,1985-11-16,6.0,0.4,25.0,6.7,3.1,-1.7,10.8,102770.0,0.0


In [15]:
# Count missing snow_depth values in rows dated outside November–March.
outside_cold_months = ~df.date.dt.month.isin([1, 2, 3, 11, 12])
df.loc[outside_cold_months, 'snow_depth'].isna().sum()

1259

In [16]:
# For rows dated outside November–March, treat a missing snow depth as zero.
# Rows in the cold months keep their NaN values.
warm_rows = ~df.date.dt.month.isin([1, 2, 3, 11, 12])
df.loc[warm_rows, 'snow_depth'] = df.loc[warm_rows, 'snow_depth'].fillna(0)

In [17]:
df.isna().sum()

date                  0
cloud_cover          19
sunshine              0
global_radiation     19
max_temp              6
mean_temp            36
min_temp              2
precipitation         6
pressure              4
snow_depth          182
dtype: int64

In [18]:
df.sample(5)

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
9668,2005-06-21,6.0,13.1,320.0,26.1,20.8,15.4,0.0,102000.0,0.0
7906,2000-08-24,1.0,12.4,255.0,26.2,19.9,12.8,0.0,102340.0,0.0
10112,2006-09-08,0.0,12.2,218.0,21.2,15.5,9.8,0.0,102970.0,0.0
1894,1984-03-09,7.0,0.6,56.0,8.1,5.0,3.0,0.2,104050.0,0.0
12545,2013-05-07,0.0,12.6,290.0,17.9,15.9,9.1,0.6,101610.0,0.0


In [19]:
df.describe()

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
count,15341,15322.0,15341.0,15322.0,15335.0,15305.0,15339.0,15335.0,15337.0,15159.0
mean,2000-01-01 00:00:00,5.268242,4.350238,118.756951,15.388777,11.475511,7.559867,1.668634,101536.605594,0.034831
min,1979-01-01 00:00:00,0.0,0.0,8.0,-6.2,-7.6,-11.8,0.0,95960.0,0.0
25%,1989-07-02 00:00:00,4.0,0.5,41.0,10.5,7.0,3.5,0.0,100920.0,0.0
50%,2000-01-01 00:00:00,6.0,3.5,95.0,15.0,11.4,7.8,0.0,101620.0,0.0
75%,2010-07-02 00:00:00,7.0,7.2,186.0,20.3,16.0,11.8,1.6,102240.0,0.0
max,2020-12-31 00:00:00,9.0,16.0,402.0,37.9,29.0,22.3,61.8,104820.0,22.0
std,,2.070072,4.028339,88.898272,6.554754,5.729709,5.326756,3.73854,1049.722604,0.522587


In [20]:
df.snow_depth.dropna()

0        9.0
1        8.0
2        4.0
3        2.0
4        1.0
        ... 
15275    0.0
15276    0.0
15277    0.0
15278    0.0
15279    0.0
Name: snow_depth, Length: 15159, dtype: float64

## mean temperature

# Making model 