# Import

In [42]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder

# Load the raw sales records, then preview the first rows and the column dtypes.
df = pd.read_csv("data/saler_info.csv")
print(df.head(), df.dtypes, sep="\n")

  type  market  period  price  promotion  advertisement  level  demand
0    A       1       1   6300      10000          10000   1.00     104
1    A       1       2   6400      10000          10000   1.00      85
2    A       1       3   6400      10000          15000   1.00      92
3    A       1       4   6500      15000          15000   1.00      82
4    A       1       5   6500      15000          20000   1.24      91
type              object
market             int64
period             int64
price              int64
promotion          int64
advertisement      int64
level            float64
demand             int64
dtype: object


# Preprocess

In this part, we mainly one-hot encode the categorical columns (`type` and `market`).

In [43]:
# Categorical columns; preprocess() replaces them with one-hot indicator columns.
tree_features = ["type", "market"]

# Numeric columns. NOTE(review): this list is not referenced anywhere else in
# the visible notebook -- confirm whether it is still needed.
regression_features = [
    "period",
    "price",
    "promotion",
    "advertisement",
    "level",
]

def preprocess(df):
    """One-hot encode the ``tree_features`` columns of ``df``.

    A new ``OneHotEncoder`` is fitted on every call, so the indicator columns
    that come back depend on the categories present in ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in the module-level
        ``tree_features``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by one indicator column per
        category, named by ``encoder.get_feature_names_out``. The result keeps
        ``df``'s index.
    """
    encoder = OneHotEncoder()
    encoded_column = encoder.fit_transform(df[tree_features])
    encoded_column_names = encoder.get_feature_names_out(tree_features)
    # Reuse df's index: pd.concat(axis=1) aligns rows on index labels, so a
    # fresh RangeIndex would misalign the indicator columns (and pad both
    # sides with NaN) whenever df has been filtered, shuffled or split.
    encoded_df = pd.DataFrame(
        encoded_column.toarray(),
        columns=encoded_column_names,
        index=df.index,
    )
    remaining = df.drop(columns=tree_features)
    return pd.concat([remaining, encoded_df], axis=1)
    
    
# Encode the full data set once; the cells below build X and y from this frame.
df_processed = preprocess(df)
# Preview the encoded frame to check the generated indicator columns.
print(df_processed.head())

   period  price  promotion  advertisement  level  demand  type_A  type_B  \
0       1   6300      10000          10000   1.00     104     1.0     0.0   
1       2   6400      10000          10000   1.00      85     1.0     0.0   
2       3   6400      10000          15000   1.00      92     1.0     0.0   
3       4   6500      15000          15000   1.00      82     1.0     0.0   
4       5   6500      15000          20000   1.24      91     1.0     0.0   

   type_C  type_D  market_1  market_2  market_3  market_4  
0     0.0     0.0       1.0       0.0       0.0       0.0  
1     0.0     0.0       1.0       0.0       0.0       0.0  
2     0.0     0.0       1.0       0.0       0.0       0.0  
3     0.0     0.0       1.0       0.0       0.0       0.0  
4     0.0     0.0       1.0       0.0       0.0       0.0  


# Make Train and Test Data

In [44]:
# 'demand' is the prediction target; every other column is a model input.
y = df_processed['demand']
X = df_processed.drop(columns='demand')

In [45]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base learners whose predictions are fed to the final estimator
base_models = [
    ('decision_tree', DecisionTreeRegressor(random_state=42)),
    ('linear_regression', Ridge()),
]

# Combine the base learners with a plain LinearRegression as the final estimator
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
)

# Train the whole stack on the training split
stacked_model.fit(X_train, y_train)

# Report the model's score on the training split and on the held-out split
print(f"Training score: {stacked_model.score(X_train, y_train)}")
print(f"Test score: {stacked_model.score(X_test, y_test)}")

Training score: 0.9944707401919897
Test score: 0.9837878590590775


# Predict

In [46]:
def predict(filename='data_to_predict/saler_info.csv',
            savefilename='output/saler_info.csv'):
    """Predict ``demand`` for the rows in ``filename`` and save them as CSV.

    Relies on the notebook globals ``df`` (raw training data), ``preprocess``
    and ``stacked_model`` (the fitted model).

    Parameters
    ----------
    filename : str
        CSV file with the rows to score. An existing ``demand`` column is
        ignored and overwritten with the predictions.
    savefilename : str
        Destination CSV. Its parent directory is created when missing.

    Raises
    ------
    ValueError
        If the input lacks a column present in the training data, or if
        encoding produces columns the model was not fitted on (for example an
        unseen category or an extra input column).
    """
    import os  # local import: only needed here, keeps the cell self-contained

    new_data = pd.read_csv(filename)
    if 'demand' in new_data.columns:
        new_data = new_data.drop(columns='demand')

    training_features = df.drop(columns='demand')
    missing = [col for col in training_features.columns if col not in new_data.columns]
    if missing:
        raise ValueError(f"{filename} is missing required columns: {missing}")

    n_new = len(new_data)
    if n_new:
        # preprocess() refits its encoder on each call, so the training rows are
        # appended to guarantee every training category gets its indicator column.
        merged_data = pd.concat([new_data, training_features], axis=0, ignore_index=True)
        # The new rows come first, so slice them off by position.
        features = preprocess(merged_data).iloc[:n_new]

        expected = list(stacked_model.feature_names_in_)
        unexpected = [col for col in features.columns if col not in expected]
        if unexpected:
            raise ValueError(
                f"{filename} produces columns the model was not trained on: {unexpected}"
            )
        # Put the columns in the order used at fit time; the input file may
        # list them differently, which scikit-learn would reject.
        predictions = stacked_model.predict(features[expected])
    else:
        # Nothing to score: still write a file with an empty 'demand' column.
        predictions = []

    new_data['demand'] = predictions

    # Only path-like targets have a directory to create; anything else that
    # to_csv accepts (e.g. an open buffer) is passed through untouched.
    if isinstance(savefilename, (str, os.PathLike)):
        out_dir = os.path.dirname(savefilename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    new_data.to_csv(savefilename, index=False)
    
# Score the default input file and write the predictions to the default output path.
predict()