In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import normaltest

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
dataset=pd.read_csv('train_data.csv')
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

contest-slp-14d: file containing sea level pressure (slp)

nmme0-tmp2m-34w: file containing most recent monthly NMME model forecasts for tmp2m (cancm30,
cancm40, ccsm30, ccsm40, cfsv20, gfdlflora0, gfdlflorb0, gfdl0, nasa0,
nmme0mean) and average forecast across those models (nmme0mean)

contest-pres-sfc-gauss-14d: pressure

mjo1d: MJO phase and amplitude

contest-pevpr-sfc-gauss-14d: potential evaporation

contest-wind-h850-14d: geopotential height at 850 millibars

contest-wind-h500-14d: geopotential height at 500 millibars

contest-wind-h100-14d: geopotential height at 100 millibars

contest-wind-h10-14d: geopotential height at 10 millibars

contest-wind-vwnd-925-14d: longitudinal wind at 925 millibars

contest-wind-vwnd-250-14d: longitudinal wind at 250 millibars

contest-wind-uwnd-250-14d: zonal wind at 250 millibars

contest-wind-uwnd-925-14d: zonal wind at 925 millibars

contest-rhum-sig995-14d: relative humidity

contest-prwtr-eatm-14d: precipitable water for entire atmosphere

nmme-prate-34w: weeks 3-4 weighted average of monthly NMME model forecasts for precipitation

nmme-prate-56w: weeks 5-6 weighted average of monthly NMME model forecasts for precipitation

nmme0-prate-56w: weeks 5-6 weighted average of most recent monthly NMME model forecasts for precipitation

nmme0-prate-34w: weeks 3-4 weighted average of most recent monthly NMME model forecasts for precipitation

nmme-tmp2m-34w: weeks 3-4 weighted average of most recent monthly NMME model forecasts for target label, contest-tmp2m-14d__tmp2m

nmme-tmp2m-56w: weeks 5-6 weighted average of monthly NMME model forecasts for target label, contest-tmp2m-14d__tmp2m

mei: MEI (mei), MEI rank (rank), and Niño Index Phase (nip)

elevation: elevation

contest-precip-14d: measured precipitation

climateregions: Köppen-Geiger climate classifications

In [None]:
# Print pandas' concise summary of the loaded frame.
dataset.info()

In [None]:
# Work on a copy so the frame loaded from disk (`dataset`) stays untouched.
train_data=dataset.copy()

Let's delete the "index" column.








Let's check which columns have missing values and how many values are missing in each of them.

In [None]:
# Drop the "index" column by building a new frame rather than deleting in place.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
train_data = train_data.drop(columns=['index'], errors='ignore')

In [None]:
# Names of the columns that contain at least one missing value, in frame order.
has_missing = train_data.isnull().any()
col_null_values = has_missing[has_missing].index.tolist()

In [None]:
# Show the names of the columns that have missing values.
col_null_values

In [None]:
# Number of missing entries for each column listed in `col_null_values`,
# kept in the same order as that list.
miss_values = [train_data[column].isnull().sum() for column in col_null_values]
miss_values

Let's create a DataFrame with two columns: `column_name` (the columns that contain missing values) and `null_values` (how many values are missing in each).

In [None]:
# Pair every affected column with its number of missing entries in one table.
missing_values_count = pd.DataFrame(
    {
        'column_name': col_null_values,
        'null_values': miss_values,
    }
)

In [None]:
# Show the per-column missing-value counts.
missing_values_count

In [None]:
# List the distinct values found in the climate-region column.
train_data['climateregions__climateregion'].unique()

Köppen climate classification scheme symbols description



A (Tropical)	
f (Rainforest)
m (Monsoon)
w (Savanna, dry winter)
s (Savanna, dry summer)

B (Dry)	
W (Arid Desert)
S (Semi-Arid or steppe)

h (Hot)
k (Cold)

C (Temperate)	
w (Dry winter)
f (No dry season)
s (Dry summer)

a (Hot summer)
b (Warm summer)
c (Cold summer)

D (Continental)	
w (Dry winter)
f (No dry season)
s (Dry summer)

a (Hot summer)
b (Warm summer)
c (Cold summer)
d (Very cold winter)

E (Polar)		
T (Tundra)
F (Ice cap)

In [None]:
# Parse the start dates, then keep only the calendar year as a new column.
start_dates = pd.DatetimeIndex(train_data['startdate'])
train_data['year'] = start_dates.year

In [None]:
# Year and measured precipitation only, ordered from the latest year to the earliest.
data = (
    train_data
    .loc[:, ['year', 'contest-precip-14d__precip']]
    .sort_values('year', ascending=False)
)
data

In [None]:
# Horizontal bar chart of how many rows fall in each year (at most the 30 most frequent).
year_counts = data['year'].value_counts().head(30)
ax = year_counts.plot(kind='barh', figsize=(6,10), color='#002542')
ax.set_title('Years', fontdict={'fontsize': 18})

In [None]:
# Box plot of measured precipitation for each year.
# The years are already the categories on the x-axis, so faceting the figure by
# `row='year'` as well only produced one mostly empty panel per year. Drawing
# all boxes on a single axes puts the years side by side for direct comparison.
sns.catplot(
    data=data,
    x='year',
    y='contest-precip-14d__precip',
    kind='box',
    height=3,
    aspect=4,
    color='blue',
)

In [None]:
# Distribution of measured precipitation: density-scaled histogram plus KDE curve.
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# `stat='density'` is the documented replacement for its default output.
sns.histplot(
    data['contest-precip-14d__precip'],
    bins=20,
    kde=True,
    stat='density',
).set_title('Precipitation')

In [None]:
# Subset holding only the elevation and measured-precipitation columns.
data1=train_data[['elevation__elevation','contest-precip-14d__precip']]


In [None]:
# Pairwise correlation (pandas default method) between the columns of `data1`.
correlation = data1.corr()

fig = plt.figure()
ax = fig.add_subplot(111)
# Fix the colour scale to [-1, 1] so the colours map to the full correlation range.
cax = ax.matshow(correlation, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)

# Take ticks and labels from the correlation matrix itself so they always match
# the cells that are drawn, and pass the rotation together with the labels
# instead of relying on pyplot's "current axes" state and on call order.
labels = correlation.columns
ticks = np.arange(len(labels))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(labels, rotation=90)
ax.set_yticklabels(labels)
plt.show()

Normality test on the target variable:

In [None]:
# Display the target column.
train_data['contest-tmp2m-14d__tmp2m']

In [None]:
# Histogram of the target column.
# density=True normalises the bars so their total area is 1; the y-axis
# therefore shows a probability density, not a count, and is labelled as such.
plt.hist(train_data['contest-tmp2m-14d__tmp2m'], bins=20, density=True, color='b')
plt.xlabel('Target value')
plt.ylabel('Density')
plt.title('contest-tmp2m-14d__tmp2m')
plt.show()

In [None]:
# Significance level (alpha) that the normality-test p-value is compared against.
significance=0.05

In [None]:
# Run scipy's normality test on the target column
# (H0: the sample comes from a normal distribution).
# Print the test statistic and the p-value on separate lines.
stats, p_value = normaltest(train_data['contest-tmp2m-14d__tmp2m'])
print(stats, p_value, sep='\n')

In [None]:
# Decision rule for the normality test (H0: the target is normally distributed).
# A p-value above the significance level does not prove H0; it only means the
# data give insufficient evidence against it, so the outcome is reported as
# "fail to reject" rather than "accept".
if p_value <= significance:
    print('Reject H0')
else:
    print('Fail to reject H0')