In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.ar_model import AutoReg
pd.options.mode.chained_assignment = None 

plt.rcParams['figure.figsize'] = (14,6)

In [1]:
def clean(file,csv_name):
    '''
    Cleaning of the Mean-Temperature Dataset 
    Parameters
    ----------
    file: str
        The data to load
    csv_name: str
        The output name of the csv-file
    Returns
    ---------
    csv-file in the folder
    '''
    df = pd.read_csv('TG_STAID002759.txt',sep=",", skiprows=19,  index_col=1, header=0, parse_dates=True)
    df = df.rename(columns=lambda x: x.strip())                         # remove white spaces in the column names 
    df['meanT'] = df['TG']*0.1                                          # give back the comma to the temperature numbers
    df.drop(df.head(25567).index, inplace=True)                         # remove all the data up untill 1946
    df.drop(df[['SOUID', 'Q_TG', 'TG']], axis =1, inplace = True)       # drop the useless columns
    df.to_csv(csv_name)             

In [14]:
def plot(csv_file):
    '''
    Vizualisation of the Temperature dataset(csv-file) 
    Parameters
    ----------
    csv_file: str
        The data to load
    Returns
    ---------
    plt.show() 
    '''
    df = pd.read_csv('data7.csv', index_col=0, parse_dates=True)
    plt.bar(x='Nan', height=df.isna().sum())
    plt.show()
    
    dates = df.index #assign x
    temp = df['meanT']#assign y

    plt.plot(dates,temp)
    plt.xlabel('dates')
    plt.ylabel('Mean Temp')
    plt.title('Temperature profile in Berlin')
    plt.show()
    
    year_2021 = df.index[-700:]
    temp_2021 = df['meanT'][-700:]
    plt.plot(year_2021,temp_2021)
    plt.xlabel('dates')
    plt.ylabel('Mean Temperature')
    plt.title('Temperature profile in Berlin 2021')
    plt.show()
    
    lag_plot(df)
    plt.show()
    
    autocorrelation_plot(df)
    plt.show()

    plot_acf(df, lags=30)
    plt.show()

In [15]:
def split_data(file):
    '''
    Load of the Temperature dataset(csv-file) and split into train and test data
    Parameters
    ----------
    csv_file: str
        The data to load
    Returns
    ---------
    train: pd.dataframe()
        The dataframe with the traindata
    test: pd.dataframe()
        The dataframe with the testdata 
    Xtrain: matrix
        The X values to train the model
    ytrain: vector
        The y values to train the model
    Xtest: matrix
        The X values to test the model
    ytest: vector
        The y values to test the model
    '''
    data = pd.read_csv(file, index_col=0, parse_dates=True)
    
    train = df[:-365]
    test = df[-365:]
    
    Xtrain = train.index
    ytrain = train['meanT']

    Xtest = test.index
    ytest = test['meanT']
    
    return train,test,Xtrain,ytrain,Xtest,ytest

In [16]:
def trend(df,X,y):
    '''
    Analyzing the Trend of the Temperature dataset.
    Parameters
    ----------
    df: pd.dataframe()
        The dataframe with the traindata
    Xtrain: matrix
        The X values to train the model
    ytrain: vector
        The y values to train the model
    Returns
    ---------
    train: pd.dataframe()
        The new dataframe with the timestep, trend column
    '''
    train['timestep'] = range(len(train))      
    Xtrend = train[['timestep']]                    
    X = train[['timestep']]
    y = train['meanT']
    m = LinearRegression()
    m.fit(X, y)
    trend = m.coef_*12*74  
    print('Trend: '+str(trend)+' °C')            
    print('intercept: ' + str(m.intercept_))
    train['trend'] = m.predict(X)
    train[['meanT', 'trend']].plot()
    return train

In [17]:
def seasonality(df,X,y):
    
    df['month'] = df.index.month #create column with no. of month
    seasonal_dummies = pd.get_dummies(df['month'],prefix='month', drop_first=True) # create dummies for each month
    df = df.merge(seasonal_dummies,left_index = True, right_index=True) #merge df seasonal_dummies and df
    Xseason = df.drop(['t_mean','trend','trend_poly','month'], axis=1) #choose X values for LinReg fit (all dummy columns)
    
    m = LinearRegression()
    m.fit(Xseason, y)
    df['trendANDseasonality'] = m.predict(Xseason) #create trend and seasonality column in df
      
    df[['t_mean','trendANDseasonality']].plot()#plot the seasonality and trend
    return df
    '''
    Analyzing the Seasonality of the Temperature dataset with the month column.
    Parameters
    ----------
    train: pd.dataframe()
        The dataframe with the data
    Xtrain: matrix
        The X values to train the model
    ytrain: vector
        The y values to train the model
    Returns
    ---------
    df: pd.dataframe()
        The new dataframe with the trendANDseasonality column and the dummies(month).
    '''    
    train['month'] = train.index.month                                      
    dummies = pd.get_dummies(train['month'],prefix='month', drop_first=True) 
    train = train.merge(dummies,left_index = True, right_index=True) 
    Xseason = train.drop(['t_mean','trend','month'], axis=1)  
    m = LinearRegression()
    m.fit(Xseason, y)
    train['tre_sea'] = m.predict(Xseason) 
    train[['meanT','tre_sea']].plot()
    return train

In [18]:
def remainder(train, csv_name):
    '''
    Analyzing the Remainder and save th df in a csv file.
    Parameters
    ----------
    train: pd.dataframe()
        The dataframe with the data
    Returns
    ---------
    train: pd.dataframe()
        The dataframe with all columns: .
    csv-file in the folder
    '''
    # Remainder
    train['remainder'] = train['meanT'] - train['tre_sea'] 
    train['remainder'].plot()
    train_rem = df['remainder']
    train_rem.to_csv(csv_output_name) 
    return train

In [12]:
df = pd.read_csv('TG_STAID002759.txt',sep=",", skiprows=19,  index_col=1, header=0, parse_dates=True)

In [13]:
df

Unnamed: 0_level_0,SOUID,TG,Q_TG
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1876-01-01,127488,22,0
1876-01-02,127488,25,0
1876-01-03,127488,3,0
1876-01-04,127488,-58,0
1876-01-05,127488,-98,0
...,...,...,...
2021-03-27,111448,69,0
2021-03-28,111448,77,0
2021-03-29,111448,144,0
2021-03-30,111448,149,0


In [14]:
df = df.rename(columns=lambda x: x.strip())        

In [15]:
df

Unnamed: 0_level_0,SOUID,TG,Q_TG
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1876-01-01,127488,22,0
1876-01-02,127488,25,0
1876-01-03,127488,3,0
1876-01-04,127488,-58,0
1876-01-05,127488,-98,0
...,...,...,...
2021-03-27,111448,69,0
2021-03-28,111448,77,0
2021-03-29,111448,144,0
2021-03-30,111448,149,0


In [16]:
df['meanT'] = df['TG']*0.1 

In [None]:
 df = pd.read_csv('TG_STAID002759.txt',sep=",", skiprows=19,  index_col=1, header=0, parse_dates=True)
    df = df.rename(columns=lambda x: x.strip())                         # remove white spaces in the column names 
    df['meanT'] = df['TG']*0.1                                          # give back the comma to the temperature numbers
    df.drop(df.head(25567).index, inplace=True)                         # remove all the data up untill 1946
    df.drop(df[['SOUID', 'Q_TG', 'TG']], axis =1, inplace = True)       # drop the useless columns
    df.to_csv(csv_name)             


In [8]:
# train autoregression
model = AutoReg(train, lags=29)
model_fit = model.fit()
print('Coefficients: %s' % model_fit.params)
# make predictions
predictions = model_fit.predict(start=len(train), end=len(train)+len(test), dynamic=False)
for i in range(len(predictions)):
    print('predicted=%f, expected=%f' % (predictions[i], test[i]))
rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)
# plot results
pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()

NameError: name 'train' is not defined