In [113]:
# idea of this notebook: train a ML model with the standard deviation of the atmospheric pressure to estimate if a location is suited for generating wind power
# Imports
import pandas as pd
import seaborn as sns
from random import randint

In [114]:
data = pd.read_parquet('../data/full_df.pqt')
# drop useless cols
data = data[['temp','dwpt', 'rhum', 'prcp','wspd', 'wpgt','pres','id']]


In [115]:
# load or compute aggregated data
std_data = data.groupby(['id']).agg('std')
mean_data = pd.read_parquet('../data/all_means_df.pqt')

In [116]:
# get the na counts for each col
nacounts = pd.concat([
    data.loc[data[col].isna() == False].groupby(['id'])[col].count()
    for col in data.columns
],axis=1)

In [117]:
# use only ids with sufficient data points
mean_data_filtered = mean_data.loc[nacounts.wspd > 50]

In [118]:
# choose a cutoff percentile, if the mean wspd of a given location is greater it's suitable for wind power generation
cutoff_percentile = .6
cutoff_wspd = mean_data_filtered.sort_values('wspd',ascending=True).iloc[round(cutoff_percentile*len(mean_data_filtered))].wspd
# add the col suitable 
mean_data_filtered.loc[:,'suitable'] = 0
mean_data_filtered.loc[mean_data_filtered.wspd > cutoff_wspd,'suitable'] = 1
# set the y data for the ml algorithm
y = mean_data_filtered.suitable

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [119]:
# use the std as x data
x = std_data.loc[y.index]

In [None]:
# fill missing values with mean of col, drop wind data to simulate missing data
x.fillna(x.mean(),inplace=True)
x.drop(['wspd','wpgt'],inplace=True,axis=1)

In [122]:
# train test split
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=.2,
    stratify=y,
    random_state=1
)

In [123]:
# import Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [124]:
# run models

##### Decision Tree

# Fitting des Decision Tree Algorithmus auf den Trainingsdaten mit der Zielvariable 'Kauf' und den Eingabevariablen 'predictor_attributes'
decision_tree_train = DecisionTreeClassifier(max_depth=10, criterion="entropy", random_state=1)
decision_tree_train.fit(xtrain, ytrain)

# Der Decision Tree Algorithmus auf Basis der Trainingsdaten liefert folgende Prognosegenauigkeit
print("Prognosegenauigkeit Decision Tree:\n")
print("Testdaten:\t", decision_tree_train.score(xtest, ytest))


##### Random Forest

# Fitting des Random Forest Algorithmus auf den Trainingsdaten mit der Zielvariable 'Kauf' und den Eingabevariablen 'predictor_attributes'
random_forest_train = RandomForestClassifier(random_state=1)
random_forest_train.fit(xtrain, ytrain)

# Der Random Forest Algorithmus auf Basis der Trainingsdaten liefert folgende Prognosegenauigkeit
print("\n\nPrognosegenauigkeit Random Forest:\n")
print("Testdaten:\t", random_forest_train.score(xtest, ytest))


##### Logistic Regression

# Fitting des Logistic Regression Algorithmus auf den Trainingsdaten mit der Zielvariable 'Kauf' und den Eingabevariablen 'predictor_attributes'
logistic_regression_train = LogisticRegression(max_iter=1000)
logistic_regression_train.fit(xtrain, ytrain)

# Der Logistic Regression Algorithmus auf Basis der Trainingsdaten liefert folgende Prognosegenauigkeit
print("\n\nPrognosegenauigkeit Logistic Regression:\n")
print("Testdaten:\t", logistic_regression_train.score(xtest, ytest))


##### K-Nearest Neighbors

# Fitting des K-Nearest Neighbors Algorithmus auf den Trainingsdaten mit der Zielvariable 'Kauf' und den Eingabevariablen 'predictor_attributes'
knn_train = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn_train.fit(xtrain, ytrain)

# Der K-Nearest Neighbors Algorithmus auf Basis der Trainingsdaten liefert folgende Prognosegenauigkeit
print("\n\nPrognosegenauigkeit K-Nearest Neighbors:\n")
print("Testdaten:\t", knn_train.score(xtest, ytest))

##### Neuronales Netz

# Fitting des Neuronalen Netz Algorithmus auf den Trainingsdaten mit der Zielvariable 'Kauf' und den Eingabevariablen 'predictor_attributes'
neuronales_netz_train = MLPClassifier(max_iter=1000)
neuronales_netz_train.fit(xtrain, ytrain)

# Der Neuronales Netz Algorithmus auf Basis der Trainingsdaten liefert folgende Prognosegenauigkeit
print("\n\nPrognosegenauigkeit Neuronales Netz:\n")
print("Testdaten:\t", neuronales_netz_train.score(xtest, ytest))

##### Gradient Tree Boosting

# Fitting des Gradient Boosting Algorithmus auf den Trainingsdaten mit der Zielvariable 'Kauf' und den Eingabevariablen 'predictor_attributes'
gradient_booosting_train = GradientBoostingClassifier()
gradient_booosting_train.fit(xtrain, ytrain)

# Der Gradient Boosting Algorithmus auf Basis der Trainingsdaten liefert folgende Prognosegenauigkeit
print("\n\nPrognosegenauigkeit Gradient Tree Boosting:\n")
print("Testdaten:\t", gradient_booosting_train.score(xtest, ytest))


Prognosegenauigkeit Decision Tree:

Testdaten:	 0.7681159420289855


Prognosegenauigkeit Random Forest:

Testdaten:	 0.782608695652174


Prognosegenauigkeit Logistic Regression:

Testdaten:	 0.7971014492753623


Prognosegenauigkeit K-Nearest Neighbors:

Testdaten:	 0.7681159420289855


Prognosegenauigkeit Neuronales Netz:

Testdaten:	 0.6956521739130435


Prognosegenauigkeit Gradient Tree Boosting:

Testdaten:	 0.7536231884057971


In [145]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,random_forest_train.predict(xtest))

array([[39,  3],
       [12, 15]], dtype=int64)

In [149]:
# choosing the random forest since the accuracy is high and leads to some positive predictions
# predict the suitability of all x-data
# start by creating full x data
full_x = std_data.fillna(std_data.mean()).drop(['wspd','wpgt'],axis=1)

In [150]:
# instantiate new algorithm, train it with all the data
random_forest = RandomForestClassifier(random_state=1)
random_forest.fit(x,y)
full_x.loc[:,'prediction_suitable'] = random_forest.predict(full_x)

In [151]:
full_x.prediction_suitable.value_counts()

0    921
1    167
Name: prediction_suitable, dtype: int64

In [152]:
# export data
full_x.to_csv('../data/wind_model_prediction.csv')