In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb

In [2]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

#office
train = pd.read_csv("train.csv", usecols=['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])
test = pd.read_csv("test.csv", usecols=['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID', 'id'])

train = train.rename(columns={'Semana': 'Week_num',
                              'Agencia_ID': 'Sales_Depot_ID',
                              'Canal_ID': 'Sales_Channel_ID',
                              'Ruta_SAK': 'Route_ID',
                              'Cliente_ID': 'Client_ID',
                              'Venta_uni_hoy': 'Sales_unit_this_week',
                              'Venta_hoy': 'Sales_this_week',
                              'Dev_uni_proxima': 'Returns_unit_next_week',
                              'Dev_proxima': 'Returns_next_week',
                              'Demanda_uni_equil': 'adjusted_demand',
                              'Producto_ID': 'Product_ID'})

test = test.rename(columns={'Semana': 'Week_num',
                            'Agencia_ID': 'Sales_Depot_ID',
                            'Canal_ID': 'Sales_Channel_ID',
                            'Ruta_SAK': 'Route_ID',
                            'Cliente_ID': 'Client_ID',
                            'Venta_uni_hoy': 'Sales_unit_this_week',
                            'Venta_hoy': 'Sales_this_week',
                            'Dev_uni_proxima': 'Returns_unit_next_week',
                            'Dev_proxima': 'Returns_next_week',
                            'Demanda_uni_equil': 'adjusted_demand',
                            'Producto_ID': 'Product_ID'})



#set a unique id for each sales depot id, sales channel id, route id, client, product combination (thanks Gemini)
combined_df = pd.concat([train,test])
combined_df['ID'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Client_ID', 'Product_ID']).ngroup()

#set a combined client ID, consisting of a unique sales depot ID, sales channel ID, route ID, and client ID
combined_df['ccid'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Client_ID']).ngroup()

#set a combined product ID, consisting of a unique sales depot ID, sales channel ID, route ID, and product ID
combined_df['cpid'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Product_ID']).ngroup()

train = combined_df.iloc[:len(train)].copy()
test = combined_df.iloc[len(train):].copy()

del combined_df


train = train.drop(columns='id')
train['adjusted_demand'] = train['adjusted_demand'].astype(int)
train = train.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

test = test.drop(columns='adjusted_demand')
test['id'] = test['id'].astype(int)
test = test.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

In [3]:
#create a dataframe of aggregate statistics for each client
testagg = train.sort_values(by=['ccid']).groupby(['ccid'], as_index=False).agg({'Product_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

client_stats = pd.DataFrame()

client_stats['ccid'] = testagg['ccid']
client_stats['Products'] = testagg['Product_ID']['nunique']
client_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
client_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
client_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
client_stats['adj_dem_max'] = testagg['adjusted_demand']['max']

del testagg

#create a dataframe of aggregate statistics for each product
testagg = train.sort_values(by=['cpid']).groupby(['cpid'], as_index=False).agg({'Client_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

product_stats =  pd.DataFrame()

product_stats['cpid'] = testagg['cpid']
product_stats['Clients'] = testagg['Client_ID']['nunique']
product_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
product_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
product_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
product_stats['adj_dem_max'] = testagg['adjusted_demand']['max']
product_stats['median_pct'] = product_stats['adj_dem_median'].rank(pct=True, method='average')

del testagg

In [4]:
# get ccid, cpid means, medians and cpid median percentage into training data
cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_mean'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_mean'] = train['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_median'] = train['ccid'].map(cidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['adj_dem_mean'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_mean'] = train['cpid'].map(pidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_median'] = train['cpid'].map(pidmapping)

# pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['median_pct'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
# train['cpid_median_pct'] = train['cpid'].map(pidmapping).round(3)

del cidmapping, pidmapping

In [5]:
print("There are", len(train['ID'].unique()), "unique training IDs.")
print("There are", len(test['ID'].unique()), "unique testing IDs.")

There are 26396648 unique training IDs.
There are 6237461 unique testing IDs.


In [5]:
# create training data based on ID in order to use lagged adjusted demand
trainIDdf = pd.DataFrame()

trainIDdf = train[['ID', 'Client_ID', 'Product_ID', 'ccid_mean', 'ccid_median', 'cpid_mean', 'cpid_median']].drop_duplicates(subset='ID', keep='first').reset_index(drop=True)

# get adjusted demand for the week for each ID 
for j in [3,4,5,6,7,8,9]:
    wkmap = pd.Series(train[train['Week_num'] == j].set_index('ID')['adjusted_demand'], index=train['ID'].unique()).to_dict()
    trainIDdf[f'Wk_{j}_dem'] = trainIDdf['ID'].map(wkmap)

trainIDdf['ID'] = trainIDdf['ID'].astype('category')
trainIDdf['Client_ID'] = trainIDdf['Client_ID'].astype('category')
trainIDdf['Product_ID'] = trainIDdf['Product_ID'].astype('category')

trainIDdf.head()

Unnamed: 0,ID,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,Wk_3_dem,Wk_4_dem,Wk_5_dem,Wk_6_dem,Wk_7_dem,Wk_8_dem,Wk_9_dem
0,0,15766,325,5.52,4,1.94,1,,1.0,,,,,
1,1,15766,328,5.52,4,1.91,2,,1.0,,,,,
2,2,15766,1212,5.52,4,2.48,2,3.0,4.0,5.0,,,4.0,1.0
3,3,15766,1216,5.52,4,2.25,2,4.0,2.0,3.0,1.0,2.0,5.0,
4,4,15766,1220,5.52,4,3.0,3,,,3.0,,,1.0,


In [6]:
del train

In [7]:
X_train = trainIDdf.dropna(subset=['Wk_9_dem']).iloc[:, 1:-1]
y_train = trainIDdf.dropna(subset=['Wk_9_dem']).iloc[:, -1]

X_train.drop(['Wk_3_dem', 'Wk_4_dem','Wk_5_dem'], axis=1, inplace=True)
X_train.rename(columns={'Wk_6_dem': 'lag_3', 'Wk_7_dem': 'lag_2', 'Wk_8_dem': 'lag_1'}, inplace=True)

X_train.head()

Unnamed: 0,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1
2,15766,1212,5.52,4,2.48,2,,,4.0
5,15766,1238,5.52,4,3.36,3,2.0,2.0,3.0
6,15766,1240,5.52,4,4.51,4,,8.0,2.0
7,15766,1242,5.52,4,2.89,3,3.0,2.0,1.0
8,15766,1250,5.52,4,6.81,6,1.0,14.0,8.0


In [8]:
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

model = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, dtrain, num_boost_round=100)

In [None]:
# get ccid, cpid means and medians into test data
cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_mean'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_mean'] = test['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_median'] = test['ccid'].map(cidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_mean'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
test['cpid_mean'] = test['cpid'].map(pidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
test['cpid_median'] = test['cpid'].map(pidmapping)

del cidmapping, pidmapping

test = test[['id', 'ID', 'Week_num', 'Client_ID', 'Product_ID', 'ccid_mean', 'ccid_median', 'cpid_mean', 'cpid_median']].sort_values(by='id')


# get adjusted demand from previous weeks
lagmap = pd.Series(trainIDdf[['ID', 'Wk_7_dem']].set_index('ID')['Wk_7_dem'], index=trainIDdf['ID'].tolist()).to_dict()
test['lag_3'] = test['ID'].map(lagmap)

lagmap = pd.Series(trainIDdf[['ID', 'Wk_8_dem']].set_index('ID')['Wk_8_dem'], index=trainIDdf['ID'].tolist()).to_dict()
test['lag_2'] = test['ID'].map(lagmap)

lagmap = pd.Series(trainIDdf[['ID', 'Wk_9_dem']].set_index('ID')['Wk_9_dem'], index=trainIDdf['ID'].tolist()).to_dict()
test['lag_1'] = test['ID'].map(lagmap)

del lagmap


test['Client_ID'] = test['Client_ID'].astype('category')
test['Product_ID'] = test['Product_ID'].astype('category')

test.head()

Unnamed: 0,id,ID,Week_num,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1
6558101,0,25973294,11,4639078,35305,2.56,2.0,4.58,4.0,,,4.0
5991443,1,23662849,11,4705135,1238,3.15,2.0,2.0,2.0,,,
5398738,2,21257171,10,4549769,32940,7.77,4.0,2.4,2.0,,2.0,2.0
1383046,3,5334985,11,4717855,43066,2.0,1.0,1.33,1.0,,,
1110570,4,4150753,11,966351,1277,5.41,5.0,,,,,


In [23]:
X_test_wk10 = xgb.DMatrix(test[test['Week_num'] == 10].iloc[:, 3:], enable_categorical=True)

predictions_10 = model.predict(X_test_wk10)

In [37]:
test['wk_10_pred_dem'] = np.nan

test.loc[test['Week_num'] == 10, 'wk_10_pred_dem'] = predictions_10

In [38]:
test.head()

Unnamed: 0,id,ID,Week_num,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1,wk_10_pred_dem
6558101,0,25973294,11,4639078,35305,2.56,2.0,4.58,4.0,,,4.0,
5991443,1,23662849,11,4705135,1238,3.15,2.0,2.0,2.0,,,,
5398738,2,21257171,10,4549769,32940,7.77,4.0,2.4,2.0,,2.0,2.0,2.01327
1383046,3,5334985,11,4717855,43066,2.0,1.0,1.33,1.0,,,,
1110570,4,4150753,11,966351,1277,5.41,5.0,,,,,,


In [45]:
test_wk11 = test[test['Week_num'] == 11]

test_wk11.drop(['lag_3'], axis=1, inplace=True)
test_wk11.rename(columns={'lag_2': 'lag_3', 'lag_1': 'lag_2', 'wk_10_pred_dem': 'lag_1'}, inplace=True)

X_test_wk11 = xgb.DMatrix(test_wk11.iloc[:, 3:], enable_categorical=True)

predictions_11 = model.predict(X_test_wk11)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_wk11.drop(['lag_3'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_wk11.rename(columns={'lag_2': 'lag_3', 'lag_1': 'lag_2', 'wk_10_pred_dem': 'lag_1'}, inplace=True)


In [46]:
test['wk_11_pred_dem'] = np.nan

test.loc[test['Week_num'] == 11, 'wk_11_pred_dem'] = predictions_11

In [50]:
test.reset_index(drop=True, inplace=True)

test.head()

Unnamed: 0,id,ID,Week_num,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1,wk_10_pred_dem,wk_11_pred_dem
0,0,25973294,11,4639078,35305,2.56,2.0,4.58,4.0,,,4.0,,2.598356
1,1,23662849,11,4705135,1238,3.15,2.0,2.0,2.0,,,,,1.378396
2,2,21257171,10,4549769,32940,7.77,4.0,2.4,2.0,,2.0,2.0,2.01327,
3,3,5334985,11,4717855,43066,2.0,1.0,1.33,1.0,,,,,0.977157
4,4,4150753,11,966351,1277,5.41,5.0,,,,,,,10.931726


In [55]:
submission = pd.DataFrame()

submission['id'] = np.arange(len(test))
submission['Demanda_uni_equil'] = test['wk_10_pred_dem'].combine_first(test['wk_11_pred_dem'])
submission.loc[submission['Demanda_uni_equil'] < 0, 'Demanda_uni_equil'] = 0

submission.head()

Unnamed: 0,id,Demanda_uni_equil
0,0,2.598356
1,1,1.378396
2,2,2.01327
3,3,0.977157
4,4,10.931726


In [57]:
len(submission[submission['Demanda_uni_equil'] == 0])

348

In [53]:
submission.to_csv("xgb_prediction_5.csv", index=False)

In [None]:
# denote categorical variables
# train['Week_num'] = train['Week_num'].astype('category')
# train['Sales_Depot_ID'] = train['Sales_Depot_ID'].astype('category')
# train['Sales_Channel_ID'] = train['Sales_Channel_ID'].astype('category')
# train['Route_ID'] = train['Route_ID'].astype('category')
# train['Client_ID'] = train['Client_ID'].astype('category')
# train['Product_ID'] = train['Product_ID'].astype('category')
# train['ID'] = train['ID'].astype('category')
# train['ccid'] = train['ccid'].astype('category')
# train['cpid'] = train['cpid'].astype('category')

In [5]:
train.drop(columns=['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'cpid_median_pct', 'ID', 'ccid', 'cpid', 'ccid_mean', 'cpid_mean'], axis=1, inplace=True)

train.head()

Unnamed: 0,Week_num,Client_ID,Product_ID,adjusted_demand,ccid_median,cpid_median
0,4,15766,325,1,4,1
1,4,15766,328,1,4,2
2,3,15766,1212,3,4,2
3,4,15766,1212,4,4,2
4,5,15766,1212,5,4,2


In [6]:
# make adjusted demand last column

# Separate the column to move from the other columns
columns = [col for col in train.columns if col != 'adjusted_demand']

# Append the column to move at the end
new_order = columns + ['adjusted_demand']

# Reorder the DataFrame columns
train = train[new_order]

train.head()

Unnamed: 0,Week_num,Client_ID,Product_ID,ccid_median,cpid_median,adjusted_demand
0,4,15766,325,4,1,1
1,4,15766,328,4,2,1
2,3,15766,1212,4,2,3
3,4,15766,1212,4,2,4
4,5,15766,1212,4,2,5


In [19]:
model = xgb.XGBRegressor(objective='reg:squaredlogerror',
                         n_estimators=25,             # Number of boosting rounds
                         learning_rate=0.1,
                         random_state=42)

In [22]:
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

In [23]:
model.fit(X_train, y_train)

In [24]:
test.head()

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,id,ID,ccid,cpid
0,11,1110,7,3301,15766,1216,924190,3,0,16
1,11,1110,7,3301,15766,1240,4521987,6,0,20
2,10,1110,7,3301,15766,1242,6217476,7,0,21
3,10,1110,7,3301,15766,1643,4740450,10,0,25
4,10,1110,7,3301,15766,3894,970784,12,0,31


In [9]:
# get ccid, cpid medians into test data
cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_median'] = test['ccid'].map(cidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
test['cpid_median'] = test['cpid'].map(pidmapping)

del cidmapping, pidmapping

In [26]:
test.head()

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,id,ID,ccid,cpid,ccid_median,cpid_median
0,11,1110,7,3301,15766,1216,924190,3,0,16,4.0,2.0
1,11,1110,7,3301,15766,1240,4521987,6,0,20,4.0,4.0
2,10,1110,7,3301,15766,1242,6217476,7,0,21,4.0,3.0
3,10,1110,7,3301,15766,1643,4740450,10,0,25,4.0,3.0
4,10,1110,7,3301,15766,3894,970784,12,0,31,4.0,5.0


In [10]:
test.drop(columns=['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'ID', 'ccid', 'cpid'], axis=1, inplace=True)

test.head()

Unnamed: 0,Week_num,Client_ID,Product_ID,id,ccid_median,cpid_median
0,11,15766,1216,924190,4.0,2.0
1,11,15766,1240,4521987,4.0,4.0
2,10,15766,1242,6217476,4.0,3.0
3,10,15766,1643,4740450,4.0,3.0
4,10,15766,3894,970784,4.0,5.0


In [11]:
test = test[['id', 'Week_num', 'Client_ID', 'Product_ID', 'ccid_median', 'cpid_median']].sort_values(by='id')

test.head()

Unnamed: 0,id,Week_num,Client_ID,Product_ID,ccid_median,cpid_median
6558101,0,11,4639078,35305,2.0,4.0
5991443,1,11,4705135,1238,2.0,2.0
5398738,2,10,4549769,32940,4.0,2.0
1383046,3,11,4717855,43066,1.0,1.0
1110570,4,11,966351,1277,5.0,


In [29]:
X_test = test.iloc[:, 2:]

In [30]:
predictions = model.predict(X_test)

In [31]:
predictions

array([2.3926792, 1.5295428, 2.1165032, ..., 2.4110427, 2.4110427,
       1.5295428], dtype=float32)

In [32]:
submission = pd.DataFrame()

submission['id'] = np.arange(len(test))
submission['Demanda_uni_equil'] = predictions

submission.head()

Unnamed: 0,id,Demanda_uni_equil
0,0,2.392679
1,1,1.529543
2,2,2.116503
3,3,0.986436
4,4,6.951692


In [33]:
submission.to_csv("xgb_prediction_1st.csv", index=False)

In [None]:


dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

In [23]:
bst = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, dtrain, num_boost_round=25)

XGBoostError: [17:59:15] C:\b\abs_90_bwj_86a\croot\xgboost-split_1724073762025\work\src\common\categorical.h:76: Invalid categorical value detected.  Categorical value should be non-negative, less than total number of categories in training data and less than 16777216

In [7]:
train['Client_ID'] = train['Client_ID'].astype('category')
train['Product_ID'] = train['Product_ID'].astype('category')

X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

In [8]:
del train

In [12]:
model2 = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, dtrain, num_boost_round=100)

In [13]:
test['Client_ID'] = test['Client_ID'].astype('category')
test['Product_ID'] = test['Product_ID'].astype('category')

In [14]:
X_test = xgb.DMatrix(test.iloc[:, 2:], enable_categorical=True)

In [15]:
predictions = model2.predict(X_test)

In [26]:
submission = pd.DataFrame()

submission['id'] = np.arange(len(test))
submission['Demanda_uni_equil'] = predictions
submission[submission['Demanda_uni_equil'] < 0]['Demanda_uni_equil'] = 0

submission.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission[submission['Demanda_uni_equil'] < 0]['Demanda_uni_equil'] = 0


Unnamed: 0,id,Demanda_uni_equil
0,0,2.841015
1,1,1.466883
2,2,2.406399
3,3,0.991431
4,4,10.515233


In [24]:
submission[submission['Demanda_uni_equil'] < 0] = 0

In [28]:
submission.to_csv("xgb_prediction_4.csv", index=False)