In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, learning_curve,cross_val_score
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the raw training data; later cells read its 'product_description' and 'price' columns.
df=pd.read_csv('data/train.csv')

In [None]:
import ast

def convert_to_dict(string):
    """Evaluate ``string`` as a Python literal and return the resulting object.

    Parsing is delegated to ``ast.literal_eval``, so only literal syntax is
    accepted and any error it raises propagates to the caller.
    """
    return ast.literal_eval(string)
# Parse every row's description string with convert_to_dict and keep the result in a new column.
df['product_description_dic']=df['product_description'].apply(convert_to_dict)


In [None]:
def split_dict_column(frame=None, feature_list=('دسته بندی', 'برند'), out_path='final_df.csv'):
    """Expand selected keys of the parsed product description into columns.

    For every name in ``feature_list`` the value stored under that key in each
    row's ``product_description_dic`` dictionary is collected into a column of
    the same name. A row whose dictionary lacks the key gets ``None`` in that
    row, so every value stays on the row it came from. (The previous version
    skipped such rows, which shifted all later values up by one position.)

    The input frame joined with the new columns, minus
    ``product_description_dic``, is written to ``out_path`` as CSV.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding a ``product_description_dic`` column of dictionaries.
        Defaults to the notebook-level ``df``, matching the no-argument call.
    feature_list : iterable of str, optional
        Dictionary keys to extract, in output column order.
    out_path : str, optional
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Only the extracted columns, indexed like ``frame``.
    """
    if frame is None:
        frame = df  # notebook-level training frame
    descriptions = frame['product_description_dic']
    # One lookup per row keeps each extracted column exactly as long as the frame.
    new_df = pd.DataFrame(
        {name: [description.get(name) for description in descriptions] for name in feature_list},
        index=frame.index,
    )
    result = pd.concat([frame, new_df], axis=1).drop(columns=['product_description_dic'])
    result.to_csv(out_path)
    return new_df

# Writes final_df.csv as a side effect; the returned frame of extracted columns is shown as the cell output.
split_dict_column()

In [None]:
# Reload the flattened training data written by split_dict_column(); this replaces the in-memory df.
df=pd.read_csv('final_df.csv')

In [None]:
# Expose the two extracted description fields under ASCII column names.
df = df.assign(brand=df['برند'], category=df['دسته بندی'])

In [None]:
# Discard the saved-index column, the two source columns that were copied to
# 'brand'/'category', and the raw description text.
df = df.drop(columns=['Unnamed: 0', 'برند', 'دسته بندی', 'product_description'])


In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Print column names, non-null counts and dtypes of the cleaned training frame.
df.info()

In [None]:
# Swap each categorical column's labels for the integer codes pandas assigns
# to its categories (missing labels receive the code -1).
cat_feature_list = ['category', 'brand']
for i in cat_feature_list:
    df[i] = pd.Categorical(df[i]).codes

In [None]:
# Rank-encode brands by price: the brand with the lowest median price gets 1,
# the next one 2, and so on.
# NOTE(review): the ranks are computed from `price` over all of df, before the
# train/test split further down, so held-out rows influence the encoding.
values = df.groupby('brand').price.median().sort_values().index
brand_weights = range(1, len(values)+1)
brand_dic = dict(zip(values, brand_weights))
# Assign the mapped column back explicitly. The earlier
# `df.brand.replace(brand_dic, inplace=True)` mutated an intermediate Series,
# which is deprecated and leaves df untouched when Copy-on-Write is enabled.
df['brand'] = df['brand'].map(brand_dic)

In [None]:
# Rank-encode categories by price: the category with the lowest mean price
# gets 1, the next one 2, and so on.
# NOTE(review): the ranks are computed from `price` over all of df, before the
# train/test split further down, so held-out rows influence the encoding.
values = df.groupby('category').price.mean().sort_values().index
category_weights = range(1, len(values)+1)
category_dic = dict(zip(values, category_weights))
# Assign the mapped column back explicitly. The earlier
# `df.category.replace(category_dic, inplace=True)` mutated an intermediate
# Series, which is deprecated and leaves df untouched when Copy-on-Write is enabled.
df['category'] = df['category'].map(category_dic)

In [None]:
# Mean price for each brand value, in ascending order.
df.groupby('brand')['price'].mean().sort_values()

In [None]:
# Mean price for each category value, in ascending order.
df.groupby('category')['price'].mean().sort_values()

In [None]:
# Pairwise correlation matrix of df's columns; the cell output is every
# column's correlation with 'price', sorted ascending.
a=df.corr()
a['price'].sort_values()

In [None]:
# Separate the target column from the predictors.
y = df["price"]
X = df.drop(columns=["price"])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=5)
X_train.shape

In [None]:
# Display the held-out feature rows.
X_test

In [None]:
# Number of held-out target values.
y_test.shape

In [None]:
# Candidate regressors, all fitted on the same training split.
lasso = Lasso()
# Seeded so the fitted tree, and therefore its scores, are the same on every
# run; the value mirrors the seed used for the train/test split.
dt = DecisionTreeRegressor(random_state=5)
# NOTE(review): KNN and Lasso are fitted on unscaled features; if the columns
# have very different ranges, consider a StandardScaler pipeline — confirm.
knn = KNeighborsRegressor()
# Degree-4 polynomial expansion followed by ordinary least squares.
poly_pipline = make_pipeline(
    PolynomialFeatures(degree=4, include_bias=False),
    LinearRegression(),
)
models = [poly_pipline, lasso, dt, knn]
for model in models:
    model.fit(X_train, y_train)

In [None]:
# 5-fold cross-validation on the training split. The scorer returns negated
# RMSE, so the sign is flipped before the mean is printed for each model.
cv_results_rms = []
for model in models:
    cv_score = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=5,
    )
    cv_results_rms.append(cv_score)
    print(f"{model}: {-1 * cv_score.mean():f} ")

In [None]:
# Mean price over all rows, as a scale reference for the RMSE values printed above.
df['price'].mean()

In [None]:
# R^2 of the fitted decision tree on the held-out rows.
y_dt_pred = dt.predict(X_test)
print("R^2 decisionTree:", r2_score(y_true=y_test, y_pred=y_dt_pred))

In [None]:
# R^2 of the polynomial regression pipeline on the held-out rows.
y_poly_pred = poly_pipline.predict(X_test)
print("R^2 poly:", r2_score(y_true=y_test, y_pred=y_poly_pred))

In [None]:
# R^2 of the fitted KNN regressor on the held-out rows. The printed label
# previously read "R^2 poly:", a copy-paste slip that mislabelled this score.
y_knn_pred = knn.predict(X_test)
print("R^2 knn:", r2_score(y_test, y_knn_pred))

In [None]:
# Load the rows to predict; they go through the same description parsing as the training data below.
df_test=pd.read_csv('data/test.csv')

In [None]:
import ast

def convert_to_dict(string):
    """Return the Python object described by the literal text in ``string``.

    Uses ``ast.literal_eval``; malformed input raises whatever that call raises.
    """
    parsed = ast.literal_eval(string)
    return parsed
# Parse every test row's description string with convert_to_dict and keep the result in a new column.
df_test['product_description_dic']=df_test['product_description'].apply(convert_to_dict)


In [None]:
def split_dict_column(frame=None, feature_list=('دسته بندی', 'برند'), out_path='final_df_test.csv'):
    """Turn selected description keys of the test rows into their own columns.

    Each name in ``feature_list`` becomes a column holding, row by row, the
    value found under that key in ``product_description_dic``. When a row's
    dictionary has no such key the cell is ``None``, which keeps the column
    aligned with the frame. (Skipping those rows, as the earlier loop did,
    moved every following value onto the wrong row.)

    The frame plus the new columns, without ``product_description_dic``, is
    saved to ``out_path`` as CSV.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a ``product_description_dic`` column of dictionaries.
        Defaults to the notebook-level ``df_test``, matching the no-argument call.
    feature_list : iterable of str, optional
        Dictionary keys to extract, in output column order.
    out_path : str, optional
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The extracted columns only, sharing ``frame``'s index.
    """
    if frame is None:
        frame = df_test  # notebook-level test frame
    parsed_rows = frame['product_description_dic']
    extracted = {}
    for feature_name in feature_list:
        # .get() yields None for a missing key, so each list has one entry per row.
        extracted[feature_name] = [row.get(feature_name) for row in parsed_rows]
    new_df = pd.DataFrame(extracted, index=frame.index)
    result = pd.concat([frame, new_df], axis=1).drop(columns=['product_description_dic'])
    result.to_csv(out_path)
    return new_df

# Writes final_df_test.csv as a side effect; the returned frame of extracted columns is shown as the cell output.
split_dict_column()


In [None]:
# Reload the flattened test rows and rename/drop columns exactly as was done
# for the training frame, so the feature columns line up with the fitted models.
df_test = pd.read_csv('final_df_test.csv')
df_test['brand'] = df_test['برند']
df_test['category'] = df_test['دسته بندی']
df_test = df_test.drop(columns=['Unnamed: 0', 'برند', 'دسته بندی', 'product_description'])
cat_feature_list = ['category', 'brand']

# The models were fitted on price-rank encodings: training label -> categorical
# code -> rank via brand_dic / category_dic. Encoding the test labels with
# codes derived from the test set alone (as this cell did before) gives numbers
# that mean something different to the model. Rebuild the training label order
# from final_df.csv and apply the same two-step mapping instead.
# NOTE(review): assumes brand_dic / category_dic still map the original codes,
# i.e. the cells that build them ran exactly once (Restart & Run All).
train_labels = pd.read_csv('final_df.csv', usecols=['برند', 'دسته بندی'])
train_encoding = {
    'brand': ('برند', brand_dic),
    'category': ('دسته بندی', category_dic),
}

for i in cat_feature_list:
    label_col, rank_dic = train_encoding[i]
    train_categories = pd.Categorical(train_labels[label_col]).categories
    # Labels never seen in training (and missing labels) receive the code -1.
    codes = pd.Categorical(df_test[i], categories=train_categories).codes
    ranks = pd.Series(codes, index=df_test.index).map(rank_dic)
    # A code without a training rank falls back to the middle rank so the
    # models never receive NaN.
    fallback = (len(rank_dic) + 1) // 2
    df_test[i] = ranks.fillna(fallback).astype(int)


In [None]:
# Display the encoded test features that are passed to the model.
df_test

In [None]:
# Predict a price for every test row with the fitted KNN regressor.
price=knn.predict(df_test) 

In [None]:
# Wrap the predictions in a DataFrame; without column names the single column is labelled 0.
price_df=pd.DataFrame(price)

In [None]:
# Copy the predictions into a column named 'price'; column 0 is dropped again before export.
price_df['price']=price_df[0]

In [None]:
# `result` is only created in the next cell, so referencing it here raised a
# NameError on a fresh kernel. Preview the predictions that exist at this point.
price_df.head()

In [None]:
# Place the predictions next to the test rows; concat with axis=1 aligns the two frames on their index.
result = pd.concat([df_test, price_df], axis=1)

In [None]:
# Leave out the encoded feature columns and the unnamed prediction column 0;
# the 'price' column stays in the frame.
result = result.drop(columns=['brand', 'category', 0])

In [None]:
# Write the remaining columns, including the predicted price, to output.csv without the index.
result.to_csv('output.csv',index=False)

In [None]:
# Read the written file back to check its contents.
pd.read_csv('output.csv')