In [None]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, ensemble
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# load the raw dataset and preview its first rows
store = pd.read_csv(filepath_or_buffer='superstore_data.csv')
store.head(n=5)

In [None]:
# basic descriptive statistics of the dataset
print(store.describe())
# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line after the report).
store.info()
# True when at least one cell in the frame is missing
print(store.isnull().values.any())

In [None]:
# derive the customer's age from the birth year, relative to the reference year
reference_year = 2022
store["Age"] = reference_year - store["Year_Birth"]
# preview the first ten computed ages
store["Age"][:10]

In [None]:
# age distribution
store["Age"].hist()

In [None]:
# age distribution broken down by campaign response (responses as rows, ages as columns)
age_by_response = pd.crosstab(index=store["Age"], columns=store["Response"])
age_by_response.transpose()

In [None]:
# list every column name in the dataset
store.columns

In [None]:
# dataset dimensions as a (rows, columns) tuple
store.shape

In [None]:
# count missing values per column
store.isnull().sum()

In [None]:
# fill missing Income values with 0
# NOTE(review): 0 is presumably far below any real income in this data; consider
# the median instead if Income is used as a model feature - confirm with the author.
store["Income"] = store["Income"].fillna(value=0)

In [None]:
# correlation matrix of the raw dataset

# At this point Education, Marital_Status and Dt_Customer have not been encoded
# or dropped yet. Recent pandas versions no longer skip non-numeric columns in
# DataFrame.corr() and raise instead, so restrict the frame to numeric/boolean
# columns explicitly (this works on old and new pandas alike).
correlations = store.select_dtypes(include=["number", "bool"]).corr()
fig, ax = plt.subplots(figsize=(18, 18))
# 10-step diverging brown/blue-green palette
colormap = sns.color_palette("BrBG", 10)
# draw on the axes created above rather than relying on the implicit current axes
ax = sns.heatmap(correlations, annot=True, linewidths=2, cmap=colormap, ax=ax)
plt.show()

In [None]:
# remove columns that are not used in the rest of the analysis
store.drop(
    columns=['Id', 'Complain', 'Response', 'Year_Birth', 'NumWebVisitsMonth', 'Dt_Customer'],
    inplace=True,
)

In [None]:
# encode marital status as an integer code (code = position in the list below)
marital_status_labels = [
    'Single', 'Married', 'Together', 'Divorced',
    'Widow', 'YOLO', 'Alone', 'Absurd',
]
mapsMaritalStatus = {status: code for code, status in enumerate(marital_status_labels)}
# plain dict lookup: a value missing from the mapping raises KeyError
store['Marital_Status'] = store['Marital_Status'].apply(
    lambda status: mapsMaritalStatus[status]
)

In [None]:
# encode education level as an integer code (code = position in the list below)
education_labels = ['Graduation', 'PhD', '2n Cycle', 'Master', 'Basic']
wyksztalcenie = {level: code for code, level in enumerate(education_labels)}
# plain dict lookup: a value missing from the mapping raises KeyError
store['Education'] = store['Education'].apply(
    lambda level: wyksztalcenie[level]
)

In [None]:
# correlation matrix after encoding the categorical columns

fig, ax = plt.subplots(figsize=(18, 18))
correlations = store.corr()
# 10-step diverging brown/blue-green palette
colormap = sns.color_palette("BrBG", n_colors=10)
ax = sns.heatmap(
    data=correlations,
    cmap=colormap,
    annot=True,
    linewidths=2,
)
plt.show()

In [None]:
# 2. second round of removing columns that are not used further
store.drop(
    columns=[
        'Education', 'Marital_Status', 'Recency', 'Teenhome',
        'NumDealsPurchases', 'Age', 'MntGoldProds', 'Kidhome',
    ],
    inplace=True,
)

In [None]:
# correlation matrix of the remaining columns

fig, ax = plt.subplots(figsize=(18, 18))
correlations = store.corr()
# 10-step diverging brown/blue-green palette
colormap = sns.color_palette("BrBG", n_colors=10)
ax = sns.heatmap(
    data=correlations,
    cmap=colormap,
    annot=True,
    linewidths=2,
)
plt.show()

In [None]:
# feature matrix: every remaining column except the target
X = store.drop(columns=['NumCatalogPurchases']).to_numpy()
X

In [None]:
# label vector: the NumCatalogPurchases column
y = store['NumCatalogPurchases'].to_numpy()
y

In [None]:
# split into training (80%) and test (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,
)

In [None]:
# uniwersalna metoda do trenowania i oceny modeli

def train_model(classifier, feature_vector_train, label, feature_vector_valid, label_valid=None):
    """Fit a classifier and score its predictions on a validation set.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train
        Training features passed to ``classifier.fit``.
    label
        Training labels passed to ``classifier.fit``.
    feature_vector_valid
        Features the fitted classifier predicts on.
    label_valid
        True labels for ``feature_vector_valid``. When omitted, the
        notebook-level ``y_test`` is used so existing four-argument calls
        keep working.

    Returns
    -------
    list
        ``[precision, recall, f1, accuracy]``. Precision, recall and F1 are
        the per-class values for the first label in sorted order only;
        accuracy is computed over all classes.
    """
    if label_valid is None:
        # Backward compatibility: earlier callers relied on the global test labels.
        label_valid = y_test

    # train the model
    classifier.fit(feature_vector_train, label)

    # generate predictions for the validation set
    predictions = classifier.predict(feature_vector_valid)

    # scikit-learn metrics expect (y_true, y_pred); passing them the other way
    # round silently swaps precision and recall.
    precision, recall, f1_score, _support = metrics.precision_recall_fscore_support(
        label_valid, predictions, zero_division=1
    )
    # NOTE(review): only index 0 (the first class) is reported, as in the
    # original code; consider an averaged score if the target is multi-class.
    score_vals = [precision[0], recall[0], f1_score[0]]
    score_vals.append(metrics.accuracy_score(label_valid, predictions))
    return score_vals

In [None]:
# MODEL 1 - logistic regression
# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classifier = linear_model.LogisticRegression(max_iter=1000)

accuracy = train_model(classifier, X_train, y_train, X_test)
# start the results table that the later model cells add to
accuracy_compare = dict(LR=accuracy)
print("LR: ", accuracy)

In [None]:
# MODEL 2 - Support Vector Machine (default hyperparameters)
svm_classifier = svm.SVC()
accuracy_compare['SVM'] = accuracy = train_model(svm_classifier, X_train, y_train, X_test)
print("SVM", accuracy)

In [None]:
# MODEL 3 - random forest
# A random forest is stochastic; fix the seed (same value as the train/test
# split) so the reported scores are reproducible across runs.
accuracy = train_model(ensemble.RandomForestClassifier(random_state=12345), X_train, y_train, X_test)
accuracy_compare['RF'] = accuracy
print ("RF: ", accuracy)

In [None]:
# model comparison: one column per model, one row per metric

metric_names = ['precision', 'recall', 'f1 score', 'accuracy']
df_compare = pd.DataFrame(data=accuracy_compare, index=metric_names)
df_compare.plot.bar()

In [None]:
# corrective action - try a neural network

# MODEL 4 - neural network
from sklearn.neural_network import MLPClassifier
# Weight initialisation and batch shuffling are random; fix the seed (same value
# as the train/test split) so the reported scores are reproducible across runs.
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000, random_state=12345)
accuracy = train_model(mlp, X_train, y_train, X_test)
accuracy_compare['neural network'] = accuracy
print ("neural network" , accuracy)

In [None]:
# corrective action - hyperparameter variations

# MODELS 5-7 - Support Vector Machine variants
# Each variant is stored under its own key, so it neither overwrites the baseline
# 'SVM' entry nor the other variants in accuracy_compare.
svm_variants = {
    "SVM gamma='auto'": svm.SVC(gamma='auto'),
    "SVM kernel='sigmoid'": svm.SVC(kernel='sigmoid'),
    # `degree` is only used by the polynomial kernel; with the default RBF kernel
    # it is ignored and the run would just repeat the baseline SVC.
    "SVM kernel='poly', degree=4": svm.SVC(kernel='poly', degree=4),
}

for variant_name, variant_classifier in svm_variants.items():
    accuracy = train_model(variant_classifier, X_train, y_train, X_test)
    accuracy_compare[variant_name] = accuracy
    print(variant_name, accuracy)