In [4]:
import statistics
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
import datapipe_utils as dt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, precision_recall_curve


In [5]:
df_train = pd.read_csv('C:\\Users\\Tushar\\Downloads\\1store_train.csv')
df_test =pd.read_csv('C:\\Users\\Tushar\\Downloads\\1store_test.csv')

In [6]:
df_train.tail()

Unnamed: 0,Id,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,countyname,storecode,Areaname,countytownname,population,state_alpha,store_Type,store
3333,2502354415,1133,862,867,1446,1531,23,25,54415,Plymouth County,METRO14460MM1200,"Brockton, MA HUD Metro FMR Area",Plympton town,2820.0,MA,Supermarket Type1,1
3334,4814599999,643,432,475,887,891,145,48,99999,Falls County,NCNTY48145N48145,"Falls County, TX",Falls County,17866.0,TX,Grocery Store,0
3335,2122399999,737,507,592,1020,1154,223,21,99999,Trimble County,METRO31140M31140,"Louisville, KY-IN HUD Metro FMR Area",Trimble County,8809.0,KY,Supermarket Type3,1
3336,2104599999,558,410,412,760,782,45,21,99999,Casey County,NCNTY21045N21045,"Casey County, KY",Casey County,15955.0,KY,Grocery Store,0
3337,2302303355,873,693,736,1132,1425,23,23,3355,Sagadahoc County,METRO38860N23023,"Sagadahoc County, ME HUD Metro FMR Area",Bath city,8514.0,ME,Supermarket Type2,1


In [7]:
df_train["storecode"].value_counts().head(10)

storecode
METRO14460MM1120    87
NCNTY23003N23003    48
METRO25540M25540    35
METRO39300M39300    34
METRO12620N23019    33
METRO44140M44140    32
NCNTY23029N23029    32
NCNTY33009N33009    29
NCNTY23017N23017    28
NCNTY33007N33007    27
Name: count, dtype: int64

In [8]:
df_train.shape

(3338, 17)

In [9]:
df_train["Areaname"].nunique()

1891

In [10]:
df_train1=df_train.drop(["Id","storecode","state_alpha","countyname"],axis=1)

In [11]:
df_train1["store"].value_counts()

store
0    1875
1    1463
Name: count, dtype: int64

In [12]:
df_train1["sales0"].nunique(),df_train1["sales1"].nunique(),df_train1["sales2"].nunique(),df_train1["sales3"].nunique(),df_train1["sales4"].nunique()

(453, 409, 433, 580, 675)

In [13]:
df_train1["sales0"].nunique()

453

In [14]:
df_test1=df_test.drop(["Id","storecode","state_alpha","countyname"],axis=1)

In [15]:
df_test1

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,Areaname,countytownname,population,store_Type
0,696,511,514,867,1034,17.0,1,99999,"Chambers County, AL",Chambers County,34215.0,Supermarket Type1
1,599,481,500,883,894,19.0,1,99999,"Cherokee County, AL",Cherokee County,25989.0,Supermarket Type1
2,599,423,475,802,1061,21.0,1,99999,"Chilton County, AL HUD Metro FMR Area",Chilton County,43643.0,Supermarket Type1
3,599,459,462,883,886,35.0,1,99999,"Conecuh County, AL",Conecuh County,13228.0,Supermarket Type1
4,599,481,505,746,801,37.0,1,99999,"Coosa County, AL",Coosa County,11539.0,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...,...
1426,627,493,524,924,1010,25.0,22,99999,"Catahoula Parish, LA",Catahoula Parish,10407.0,Supermarket Type1
1427,651,461,506,811,944,269.0,48,99999,"King County, TX",King County,286.0,Supermarket Type1
1428,922,743,748,1182,1232,13.0,23,33840,"Knox County, ME",Hope town,1536.0,Supermarket Type3
1429,593,402,438,739,862,81.0,19,99999,"Hancock County, IA",Hancock County,11341.0,Supermarket Type1


In [16]:
#Id : Store ID 
#numeric sale figures for 5 types :
#sales0
#sales1
#sales2
#sales3
#sales4

#country : categorical :: coded values for country 
#State : categorical :: coded values for State
#CouSub : numeric :: subscription values at county level
#countyname : Categorical :: county names
#storecode : categorical :: store codes , this should not be used as is but can be source of a feature
#Areaname : categorical :: name of the area , many times matches with county name
#countytownname : categorical :: county town name
#population : numeric :: population of the store area
#state_alpha : categorical :: short codes for state
#store_Type : categorical :: type of store 
#store : categorical 1/0 : target indicator var 1=opened 0=not opened 


In [17]:
df_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3338 entries, 0 to 3337
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sales0          3338 non-null   int64  
 1   sales1          3338 non-null   int64  
 2   sales2          3338 non-null   int64  
 3   sales3          3338 non-null   int64  
 4   sales4          3338 non-null   int64  
 5   country         3338 non-null   int64  
 6   State           3338 non-null   int64  
 7   CouSub          3338 non-null   int64  
 8   Areaname        3338 non-null   object 
 9   countytownname  3338 non-null   object 
 10  population      3337 non-null   float64
 11  store_Type      3338 non-null   object 
 12  store           3338 non-null   int64  
dtypes: float64(1), int64(9), object(3)
memory usage: 339.1+ KB


In [18]:
df_train1.describe()

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,population,store
count,3338.0,3338.0,3338.0,3338.0,3338.0,3338.0,3338.0,3338.0,3337.0,3338.0
mean,809.332534,570.453865,641.10695,1067.905932,1218.340024,73.943679,30.168664,80731.565608,70764.34,0.438286
std,236.00544,168.792737,189.71265,298.884119,352.002767,96.602661,14.991198,30818.495056,285559.4,0.496251
min,405.0,314.0,327.0,519.0,570.0,1.0,1.0,100.0,0.0,0.0
25%,643.0,453.0,502.0,860.0,953.5,13.0,21.0,64102.5,4247.0,0.0
50%,718.0,519.0,574.0,960.0,1132.0,35.0,27.0,99999.0,15911.0,0.0
75%,924.0,647.0,735.0,1199.0,1411.0,106.5,42.0,99999.0,42745.0,1.0
max,2062.0,1291.0,1635.0,2801.0,3386.0,840.0,78.0,99999.0,9818605.0,1.0


In [19]:
df_train1.isnull().sum()

sales0            0
sales1            0
sales2            0
sales3            0
sales4            0
country           0
State             0
CouSub            0
Areaname          0
countytownname    0
population        1
store_Type        0
store             0
dtype: int64

In [20]:
df_test1.isnull().sum()

sales0            0
sales1            0
sales2            0
sales3            0
sales4            0
country           1
State             0
CouSub            0
Areaname          0
countytownname    0
population        1
store_Type        0
dtype: int64

In [21]:
df_train1['Areaname'].nunique()

1891

In [22]:
df_train1['store_Type'].unique()

array(['Supermarket Type1', 'Supermarket Type3', 'Grocery Store',
       'Supermarket Type2'], dtype=object)

In [23]:
df_train1["population"] =df_train1["population"].fillna(df_train["population"].median())

In [24]:
df_train1['population'].isnull().sum()

0

In [25]:
df_test1["population"] =df_test1["population"].fillna(df_train["population"].median())

In [26]:
df_test1['population'].isnull().sum()

0

In [27]:
df_test1["country"] =df_test1["country"].fillna(df_test1["population"].median())

In [28]:
df_test1['country'].isnull().sum()

0

In [29]:
df_test1["country"].nunique()

193

In [30]:
df_train1

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,Areaname,countytownname,population,store_Type,store
0,848,588,666,1116,1133,9,23,19770,"Hancock County, ME",Eastbrook town,423.0,Supermarket Type1,0
1,925,717,780,1283,1550,1,50,29575,"Addison County, VT",Granville town,298.0,Supermarket Type1,0
2,924,616,739,1154,1314,13,25,8470,"Springfield, MA HUD Metro FMR Area",Brimfield town,3609.0,Supermarket Type1,1
3,924,646,683,1292,1297,35,6,99999,"Lassen County, CA",Lassen County,34895.0,Supermarket Type3,0
4,1017,730,735,1208,1326,27,50,60100,"Windsor County, VT",Rochester town,1139.0,Supermarket Type1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3333,1133,862,867,1446,1531,23,25,54415,"Brockton, MA HUD Metro FMR Area",Plympton town,2820.0,Supermarket Type1,1
3334,643,432,475,887,891,145,48,99999,"Falls County, TX",Falls County,17866.0,Grocery Store,0
3335,737,507,592,1020,1154,223,21,99999,"Louisville, KY-IN HUD Metro FMR Area",Trimble County,8809.0,Supermarket Type3,1
3336,558,410,412,760,782,45,21,99999,"Casey County, KY",Casey County,15955.0,Grocery Store,0


In [31]:
df_train1=pd.get_dummies(df_train1,columns=["Areaname","store_Type","countytownname"],dtype=int,drop_first=True)

In [32]:
df_test1=pd.get_dummies(df_test1,columns=["Areaname","store_Type","countytownname"],dtype=int,drop_first=True)

In [33]:
df_train1

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,population,store,...,countytownname_Yazoo County,countytownname_Yell County,countytownname_Yellowstone County,countytownname_Yolo County,countytownname_York County,countytownname_York town,countytownname_Young County,countytownname_Yuba County,countytownname_Yuma County,countytownname_Zapata County
0,848,588,666,1116,1133,9,23,19770,423.0,0,...,0,0,0,0,0,0,0,0,0,0
1,925,717,780,1283,1550,1,50,29575,298.0,0,...,0,0,0,0,0,0,0,0,0,0
2,924,616,739,1154,1314,13,25,8470,3609.0,1,...,0,0,0,0,0,0,0,0,0,0
3,924,646,683,1292,1297,35,6,99999,34895.0,0,...,0,0,0,0,0,0,0,0,0,0
4,1017,730,735,1208,1326,27,50,60100,1139.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3333,1133,862,867,1446,1531,23,25,54415,2820.0,1,...,0,0,0,0,0,0,0,0,0,0
3334,643,432,475,887,891,145,48,99999,17866.0,0,...,0,0,0,0,0,0,0,0,0,0
3335,737,507,592,1020,1154,223,21,99999,8809.0,1,...,0,0,0,0,0,0,0,0,0,0
3336,558,410,412,760,782,45,21,99999,15955.0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
df_train1["store"]

0       0
1       0
2       1
3       0
4       0
       ..
3333    1
3334    0
3335    1
3336    0
3337    1
Name: store, Length: 3338, dtype: int64

In [35]:
X = df_train1.drop("store", axis=1)
y = df_train1["store"]


In [36]:
X

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,population,"Areaname_Abilene, TX MSA",...,countytownname_Yazoo County,countytownname_Yell County,countytownname_Yellowstone County,countytownname_Yolo County,countytownname_York County,countytownname_York town,countytownname_Young County,countytownname_Yuba County,countytownname_Yuma County,countytownname_Zapata County
0,848,588,666,1116,1133,9,23,19770,423.0,0,...,0,0,0,0,0,0,0,0,0,0
1,925,717,780,1283,1550,1,50,29575,298.0,0,...,0,0,0,0,0,0,0,0,0,0
2,924,616,739,1154,1314,13,25,8470,3609.0,0,...,0,0,0,0,0,0,0,0,0,0
3,924,646,683,1292,1297,35,6,99999,34895.0,0,...,0,0,0,0,0,0,0,0,0,0
4,1017,730,735,1208,1326,27,50,60100,1139.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3333,1133,862,867,1446,1531,23,25,54415,2820.0,0,...,0,0,0,0,0,0,0,0,0,0
3334,643,432,475,887,891,145,48,99999,17866.0,0,...,0,0,0,0,0,0,0,0,0,0
3335,737,507,592,1020,1154,223,21,99999,8809.0,0,...,0,0,0,0,0,0,0,0,0,0
3336,558,410,412,760,782,45,21,99999,15955.0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [35]:
X_train.shape,X_test.shape

((2670, 4273), (668, 4273))

In [36]:
scaler=StandardScaler()

In [37]:
X_train=scaler.fit_transform(X_train)

In [38]:
X_test=scaler.transform(X_test)

In [39]:
X_train

array([[ 0.1713871 , -0.3851331 ,  0.17505176, ..., -0.01935645,
        -0.01935645,  0.        ],
       [ 0.48965359,  1.03432506,  0.57578621, ..., -0.01935645,
        -0.01935645,  0.        ],
       [-0.85979632, -1.00874442, -1.00606032, ..., -0.01935645,
        -0.01935645,  0.        ],
       ...,
       [ 0.17563065,  0.11375596,  0.14341483, ..., -0.01935645,
        -0.01935645,  0.        ],
       [-0.66034932, -0.52767283, -0.81623873, ..., -0.01935645,
        -0.01935645,  0.        ],
       [ 0.58301176,  0.69579319,  0.70760676, ..., -0.01935645,
        -0.01935645,  0.        ]])

# Decision Tree Classifier

In [40]:
reg = DecisionTreeClassifier(criterion = 'gini',
                           max_depth=100,
                           min_samples_split=100,
                           random_state=40)

In [41]:
reg.fit(X_train,y_train)

In [42]:
y_pred1 =reg.predict(X_test)

In [43]:
y_pred1

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,

In [44]:
reg.score(X_test,y_test)

0.7305389221556886

In [45]:
score1=reg.predict_proba(X_train)[:-1]

In [46]:
score1

array([[0.75      , 0.25      ],
       [0.75      , 0.25      ],
       [0.        , 1.        ],
       ...,
       [1.        , 0.        ],
       [0.75      , 0.25      ],
       [0.60227273, 0.39772727]])

In [47]:
score=reg.predict_proba(X_test)[:-1]

In [48]:
score

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [0.72727273, 0.27272727],
       ...,
       [0.08080808, 0.91919192],
       [0.        , 1.        ],
       [0.08080808, 0.91919192]])

# Logistic Regression

In [49]:
log_reg=LogisticRegression()

In [50]:
log_reg.fit(X_train,y_train)

In [51]:
y_pred2=log_reg.predict(X_test)

In [52]:
y_pred2

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,

In [53]:
sc3=log_reg.predict_proba(X_train)[:,-1]

In [54]:
sc3

array([7.66877124e-04, 3.49209768e-03, 9.98293116e-01, ...,
       1.61368794e-03, 8.23670853e-04, 9.94335676e-01])

In [55]:
sc4=log_reg.predict_proba(X_test)[:,-1]

In [56]:
sc4

array([3.93882601e-02, 3.50869342e-02, 7.07301748e-02, 1.02016527e-01,
       9.99689926e-01, 4.16186389e-01, 4.61270776e-01, 2.22645534e-01,
       7.36960875e-01, 9.34995472e-01, 4.12930503e-05, 9.99842463e-01,
       4.11247764e-02, 4.85022094e-03, 5.41584845e-02, 6.75783183e-01,
       1.35264059e-02, 9.97403859e-01, 8.85732233e-01, 3.25600475e-04,
       1.37390116e-02, 4.79341018e-02, 7.96432315e-01, 9.04279805e-01,
       3.40749113e-03, 6.67587200e-01, 8.60748256e-01, 9.99997113e-01,
       3.80117988e-04, 3.83684922e-02, 3.28323241e-02, 8.80292192e-02,
       4.26891127e-03, 5.44816454e-01, 3.36900736e-01, 6.16615955e-02,
       4.29243031e-03, 4.35417372e-01, 1.35462599e-02, 1.96613783e-02,
       5.14912768e-01, 9.74135696e-01, 7.25918778e-01, 9.62823558e-01,
       1.29857480e-02, 3.41664270e-03, 8.51248628e-02, 5.99582533e-03,
       9.94245804e-01, 2.77840395e-02, 9.32967401e-01, 9.80646533e-01,
       4.36096761e-02, 4.02303114e-01, 2.99879313e-02, 4.47773044e-02,
      

In [57]:
log_reg.score(X_test,y_test)

0.7170658682634731

# Random Forest

In [58]:
y_train.shape

(2670,)

In [2]:
rf_params={ 'class_weight':[None,'balanced'], 
        'criterion':['entropy','gini'],
        'max_depth':[None,100,200,250,300,350,400,500,600,650,700,1000,1500,2000,2500,3000,3500,4000,4500,5000],
            'min_samples_leaf':[1,2,5,10,15,20], 
            'min_samples_split':[2,5,10,15,20]
       }

In [3]:
rf_model=RandomForestClassifier()

NameError: name 'RandomForestClassifier' is not defined

In [4]:
rf_gs=RandomizedSearchCV(rf_model,
                        param_distributions=rf_params,
                        scoring='roc_auc',
                        cv=20,
                        n_iter=70,
                        n_jobs=-1,
                        verbose=20)

NameError: name 'RandomizedSearchCV' is not defined

In [5]:
rf_model.fit(X_train, y_train)

NameError: name 'rf_model' is not defined

In [63]:
y_pred=rf_model.predict(X_test)

In [64]:
y_pred

array([0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,

In [65]:
np.array(y_test)

array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,

In [66]:
sc=log_reg.predict_proba(X_train)[:,1]

In [1]:
sc

''

In [116]:
sc1=rf_model.predict_proba(X_test)

In [117]:
sc1

array([[0.99, 0.01],
       [0.88, 0.12],
       [0.85, 0.15],
       ...,
       [0.02, 0.98],
       [0.29, 0.71],
       [0.83, 0.17]])

In [118]:
rf_model.score(X_test,y_test)

0.7544910179640718

In [68]:
prediction_store = pd.DataFrame({'Price': y_pred})

In [69]:
prediction_store

Unnamed: 0,Price
0,0
1,0
2,0
3,0
4,1
...,...
663,0
664,1
665,1
666,1


In [70]:
# Specify the file path where you want to save the CSV file
file = 'store.csv'

# Export predictions to a CSV file
prediction_store.to_csv(file, index=False)

print(f'store predictions exported to {file}.')

store predictions exported to store.csv.
