In [None]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
import fastai
from fastai import *
from fastai.tabular.all import *
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import preprocessing
import sklearn.model_selection as cv
import xgboost as xgb
from xgboost import XGBRegressor
# No explicit `import os` exists above; the name was presumably leaking in through
# one of the star imports. Import it here so this cell does not depend on that.
import os

# Print the path of every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the order-items CSV of the Olist e-commerce dataset and preview it.
orders_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/brazilian-ecommerce/olist_order_items_dataset.csv"
)
orders_df

In [None]:
orders_df["seller_id"].nunique(dropna=False)   # there are 3095 sellers

In [None]:
# Quantity of each product bought within an order.
# Per the Olist data dictionary, `order_item_id` is a running item number inside an
# order (1, 2, 3, ...). The quantity is therefore the NUMBER of rows in each group;
# summing the ids over-counts (two units -> 1 + 2 = 3) and inflates the revenue.
# The column keeps its original name because later cells read `order_item_id`
# as the quantity.
quantity_df = (
    orders_df
    .groupby(['order_id', 'product_id', 'seller_id', 'price'])['order_item_id']
    .count()
    .reset_index()
)
# Line total = unit price x quantity.
quantity_df['order_price'] = quantity_df['price'] * quantity_df['order_item_id']
quantity_df

# Get the revenue for each seller

In [None]:
# Summary of how often each per-order product quantity occurs, most frequent first.
quantity_counts = quantity_df["order_item_id"].value_counts().sort_values()
tdf = pd.DataFrame(
    {
        'order_item_id': quantity_counts.index,
        'order_counts': quantity_counts.values,
    }
).sort_values(by="order_counts", ascending=False)
tdf.head()

In [None]:
# Rows whose aggregated order_item_id value is greater than one.
quantity_df.query("order_item_id > 1")

In [None]:
# Total revenue per seller: sum of the line totals, one row per seller_id.
total_revenue_df = quantity_df.groupby('seller_id', as_index=False)['order_price'].sum()
total_revenue_df

In [None]:
# Load the pre-merged Olist table, discard the leftover CSV index column and keep
# only the seller id plus the three columns used as model inputs.
olist = (
    pd.read_csv("../input/merge-all-olist/olist.csv")
    .drop(columns='Unnamed: 0')
    .loc[:, ["seller_id", "review_score", "price", "freight_value"]]
)
olist

In [None]:
# Restrict the per-seller revenue to the sellers that appear in `olist`.
# `olist` can hold several rows per seller (it is aggregated per seller further
# down), so merging it row by row and then summing would add a seller's total once
# per row and inflate it. Merging against the de-duplicated seller ids counts every
# total exactly once.
seller_ids = olist[["seller_id"]].drop_duplicates()
total_revenue_df = pd.DataFrame(
    seller_ids
    .merge(total_revenue_df, on="seller_id", how="left")
    .groupby("seller_id")["order_price"]
    .sum()  # one row per seller here; sum() keeps "seller without revenue -> 0"
)

# clean the data

In [None]:
# Some rows hold the strings "MG"/"SP" in review_score (they look like state codes
# from misaligned CSV rows - TODO confirm). Replace them with 0 so the column can be
# converted to numbers later.
# A boolean mask with .loc writes into `olist` itself; the previous per-row chained
# assignment (olist[col][i] = 0) goes through an intermediate object and is not
# guaranteed to modify the frame.
olist.loc[olist["review_score"].isin(["MG", "SP"]), "review_score"] = 0

In [None]:
# Some rows hold the strings "PR"/"SP" in price (they look like state codes from
# misaligned CSV rows - TODO confirm). Replace them with 0 so the column can be
# converted to numbers later.
# A boolean mask with .loc writes into `olist` itself; the previous per-row chained
# assignment (olist[col][i] = 0) goes through an intermediate object and is not
# guaranteed to modify the frame.
olist.loc[olist["price"].isin(["PR", "SP"]), "price"] = 0

In [None]:
# Some rows hold category/state strings in freight_value (presumably misaligned CSV
# rows - TODO confirm). Replace them with 0 so the column can be converted to
# numbers later.
# A boolean mask with .loc writes into `olist` itself; the previous per-row chained
# assignment (olist[col][i] = 0) goes through an intermediate object and is not
# guaranteed to modify the frame.
olist.loc[
    olist["freight_value"].isin(["health_beauty", "SP", "perfumery"]),
    "freight_value",
] = 0

In [None]:
# Convert the three cleaned columns to numeric dtypes (raises if any value is still
# not parseable as a number).
for column in ("review_score", "price", "freight_value"):
    olist[column] = pd.to_numeric(olist[column])

# merge the olist dataset with total revenue dataset

In [None]:
# Collapse to one row per seller (column sums), attach the seller's revenue and
# expose it under the name `revenue`.
seller_totals = olist.groupby("seller_id")[["review_score", "price", "freight_value"]].sum()
olist = (
    seller_totals
    .merge(total_revenue_df, on="seller_id", how="inner")
    .rename(columns={"order_price": "revenue"})
)
olist

# Making predictions on revenue

In [None]:
# Model inputs are the three aggregated columns; the target is the last column of `olist`.
feature = ["review_score", "price", "freight_value"]
X, y = olist[feature], olist.iloc[:, -1]

In [None]:
# 80/20 train/test split with a fixed seed (no stratification).
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Model Evaluation

In [None]:
# define a regression evaluation function
def rr(y_true, y_pred, model_name):
    """Summarise regression quality for one model as a single-column table.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.
    model_name : str
        Used as the name of the single output column.

    Returns
    -------
    pandas.DataFrame
        Indexed by metric name (``explained_variance``, ``mean_squared_log_error``,
        ``r2``, ``MAE``, ``MSE``, ``RMSE``) with one column ``model_name``. Each value
        is rounded to 4 decimals and stored as a ``'%.5f'``-formatted string.
        ``mean_squared_log_error`` is ``'nan'`` when scikit-learn cannot compute it
        for the given values (e.g. negative predictions).
    """
    import numpy as np
    import pandas as pd
    import sklearn.metrics as metrics

    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    # MSLE is undefined for some negative inputs and scikit-learn raises ValueError
    # for them. Report NaN instead of aborting the whole comparison. Shape or type
    # problems have already raised in the metric calls above, so only the
    # value-range error can reach this handler.
    try:
        mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    except ValueError:
        mean_squared_log_error = float("nan")

    scores = {
        'explained_variance': explained_variance,
        'mean_squared_log_error': mean_squared_log_error,
        'r2': r2,
        'MAE': mean_absolute_error,
        'MSE': mse,
        'RMSE': np.sqrt(mse),
    }
    # Same presentation as before: round to 4 decimals, then render with 5 decimals.
    formatted = {name: '%.5f' % round(value, 4) for name, value in scores.items()}
    return pd.DataFrame({model_name: pd.Series(formatted)})

# RandomForestRegressor

In [None]:
# Search space for the random forest: 3 x 5 = 15 combinations in total.
n_estimators = [100, 200, 300]
max_depth = [3, 4, 5, 6, 10]
grid = {'n_estimators': n_estimators,
        'max_depth': max_depth,
}
# Seed the forest itself: without random_state every fit draws different bootstrap
# samples, so CV scores and the selected parameters change from run to run.
m = RandomForestRegressor(random_state=42)
# Never request more candidates than the grid holds; the previous n_iter=20
# exceeded the 15 available combinations.
n_candidates = len(n_estimators) * len(max_depth)
mrf = RandomizedSearchCV(estimator=m,
                         param_distributions=grid,
                         n_iter=min(20, n_candidates),
                         cv=3,
                         verbose=2,
                         random_state=42,
                         scoring='neg_root_mean_squared_error',
                         n_jobs=-1)
mrf.fit(X_train, y_train)
print("score: ", mrf.best_score_)
print("best estimator parameters: ", mrf.best_estimator_.get_params())


In [None]:
# The fitted search object predicts with its refitted best estimator.
y_predrf = mrf.predict(X_test)

# XGBoostRegressor

In [None]:
# Candidate values for each tuned XGBoost hyper-parameter.
n_estimators = [100, 200, 300]
max_depth = [3, 4, 5, 6, 10]
learning_rate = [0.03, 0.3]
subsample = [0.5, 0.7, 1]
grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    subsample=subsample,
)
mxgb = XGBRegressor()
m_randomxgb = RandomizedSearchCV(
    estimator=mxgb,
    param_distributions=grid,
    n_iter=90,
    cv=3,
    verbose=2,
    random_state=42,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
m_randomxgb.fit(X_train, y_train)
print(m_randomxgb.best_params_)
# Release the search object and the template estimator once the result is printed.
del m_randomxgb, mxgb

In [None]:
# Final XGBoost model with fixed hyper-parameters.
# NOTE(review): the values are hard-coded - presumably copied from the search output
# above; confirm they still match after any re-run.
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled XGBoost
# build - confirm against the target environment.
xgb_settings = dict(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.5,
    random_state=42,
    tree_method='gpu_hist',
    n_jobs=-1,
)
mxgb = xgb.XGBRegressor(**xgb_settings)
mxgb.fit(X_train, y_train)
y_predxgb = mxgb.predict(X_test)

# LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline.
mlr = LinearRegression(n_jobs=-1)
mlr.fit(X_train, y_train)
y_predlr = mlr.predict(X_test)
# Clamp negative predictions to zero, in place.
y_predlr[y_predlr < 0] = 0
y_predlr

# GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Search space for gradient boosting: 3 x 5 x 2 x 3 = 90 combinations, all of which
# are evaluated because n_iter is 90.
n_estimators = [100, 200, 300]
max_depth = [3, 4, 5, 6, 10]
learning_rate = [0.03, 0.3]
subsample = [0.5, 0.7, 1]
grid = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
}
# Seed the estimator: with subsample < 1 each boosting stage draws a random subset
# of rows, so an unseeded model gives different scores and predictions on every run.
mgb = GradientBoostingRegressor(random_state=42)
m_randomgb = RandomizedSearchCV(estimator=mgb,
                                param_distributions=grid,
                                n_iter=90,
                                cv=3,
                                verbose=2,
                                random_state=42,
                                scoring='neg_root_mean_squared_error',
                                n_jobs=-1)
m_randomgb.fit(X_train, y_train)
y_predgb = m_randomgb.best_estimator_.predict(X_test)

In [None]:
# Clamp negative gradient-boosting predictions to zero, in place.
y_predgb[y_predgb < 0] = 0

# Bagging (weighted average of the four models' predictions)

In [None]:
# Fixed blending weights for the four models (they add up to 1).
w_rf, w_xgb, w_lr, w_gb = 0.2, 0.4, 0.1, 0.3
y_bagging = w_rf * y_predrf + w_xgb * y_predxgb + w_lr * y_predlr + w_gb * y_predgb

# ----------------------------

In [None]:
import lightgbm as lgb

# Starting LightGBM configuration for the cross-validated round-count search.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.1,
    'num_leaves': 50,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
data_train = lgb.Dataset(X_train, y_train, silent=True)
cv_options = dict(
    num_boost_round=1000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='rmse',
    early_stopping_rounds=50,
    verbose_eval=50,
    show_stdv=True,
    seed=0,
)
cv_results = lgb.cv(params, data_train, **cv_options)

# One entry per boosting round that was kept; the last entry is the final CV RMSE.
rmse_curve = cv_results['rmse-mean']
print('best n_estimators:', len(rmse_curve))
print('best cv score:', rmse_curve[-1])

In [None]:
# Alias of the training features (same object, no copy) used by the grid-search cells below.
df_train = X_train

In [None]:
from sklearn.model_selection import GridSearchCV

# Step 1: coarse search over tree depth and leaf count.
params_test1 = {
    'max_depth': range(3, 8, 2),
    'num_leaves': range(50, 170, 30),
}
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=50,
    learning_rate=0.1,
    n_estimators=9,
    max_depth=6,
    metric='rmse',
    bagging_fraction=0.8,
    feature_fraction=0.8,
)
gsearch1 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test1,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=4,
)
gsearch1.fit(df_train, y_train)
gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Step 2: finer search over tree depth and leaf count, reusing `model_lgb` as defined
# by the preceding cell.
params_test2 = {
    'max_depth': [6, 7, 8],
    'num_leaves': [38, 44, 50, 56, 62],
}
gsearch2 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test2,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=4,
)
gsearch2.fit(df_train, y_train)
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Step 3: search over the minimum leaf size / leaf weight with max_depth fixed at 7.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=50,
    learning_rate=0.1,
    n_estimators=9,
    max_depth=7,
    metric='rmse',
    bagging_fraction=0.8,
    feature_fraction=0.8,
)
params_test3 = {
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight': [0.001, 0.002],
}
gsearch3 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test3,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=4,
)
gsearch3.fit(df_train, y_train)
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Step 4: search over the feature and row sampling fractions.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=50,
    learning_rate=0.1,
    n_estimators=9,
    max_depth=7,
    metric='rmse',
    bagging_freq=5,
    min_child_samples=19,
)
params_test4 = {
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
}
gsearch4 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test4,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=4,
)
gsearch4.fit(df_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Step 5: finer search over the feature sampling fraction only.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=50,
    learning_rate=0.1,
    n_estimators=9,
    max_depth=7,
    metric='rmse',
    min_child_samples=19,
)
params_test5 = {
    'feature_fraction': [0.82, 0.85, 0.88, 0.9, 0.92, 0.95, 0.98],
}
gsearch5 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test5,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=4,
)
gsearch5.fit(df_train, y_train)
gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Step 6: search over the L1 / L2 regularisation strengths.
regularisation_values = [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
params_test6 = {
    'reg_alpha': list(regularisation_values),
    'reg_lambda': list(regularisation_values),
}
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=50,
    learning_rate=0.1,
    n_estimators=9,
    max_depth=7,
    metric='rmse',
    min_child_samples=19,
    feature_fraction=0.85,
)
gsearch6 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test6,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=4,
)
gsearch6.fit(df_train, y_train)
gsearch6.best_params_, gsearch6.best_score_

In [None]:
# LightGBM configuration for the final model, with a much smaller learning rate;
# cross-validation with early stopping is used to pick the number of rounds.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.005,
    'num_leaves': 50,
    'max_depth': 9,
    'min_data_in_leaf': 19,
    'subsample': 1,
    'colsample_bytree': 0.85,
}

data_train = lgb.Dataset(df_train, y_train, silent=True)
cv_options = dict(
    num_boost_round=10000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='rmse',
    early_stopping_rounds=50,
    verbose_eval=100,
    show_stdv=True,
)
cv_results = lgb.cv(params, data_train, **cv_options)

# One entry per boosting round that was kept; the last entry is the final CV RMSE.
rmse_curve = cv_results['rmse-mean']
print('best n_estimators:', len(rmse_curve))
print('best cv score:', rmse_curve[-1])

In [None]:
# Final LightGBM regressor built from `params`.
# NOTE(review): 267 rounds is hard-coded - presumably read off the CV output above; confirm.
lg = lgb.LGBMRegressor(n_estimators=267, **params)

In [None]:
# Fit the LightGBM regressor on the training split.
lg.fit(X_train, y_train)

In [None]:
# Test-set predictions of the LightGBM model.
# NOTE(review): the comparison table built in the cells visible below does not use them.
y_predlgb = lg.predict(X_test)

# ---------------------

In [None]:
# Score every model (and the blend) on the test split with rr().
rfpred = rr(y_test, y_predrf, "RandomForestRegressor")
xgbpred = rr(y_test, y_predxgb, "XGBoostRegressor")
lrpred = rr(y_test, y_predlr, "LinearRegression")
gbpred = rr(y_test, y_predgb, "GradientBoostingRegressor")
baggingpred = rr(y_test, y_bagging, "0.2 * RF + 0.4 * XGB + 0.1 * LR + 0.3 * GB")

# One row per model, one column per metric, model name in a `model` column.
df = (
    pd.concat([rfpred, xgbpred, lrpred, gbpred, baggingpred], axis=1)
    .T
    .reset_index()
    .rename(columns={'index': 'model'})
)
df


In [None]:
# Round the blended predictions to 2 decimals, writing back into the same array.
y_bagging[:] = np.round(y_bagging, 2)

In [None]:
# Scratch check: pass a float written in scientific notation through str() and back.
x = 1.357e-02
x = float(str(x))
x

In [None]:
# Same rounding loop as the earlier cell; on a top-to-bottom run the values are
# already rounded to 2 decimals, so this is effectively a no-op.
for index,item in enumerate(y_bagging):
    y_bagging[index] = round(item, 2)

In [None]:
# Side-by-side view of actual vs. blended values for a hand-picked set of test rows.
sample_rows = [104, 116, 118, 2, 204, 306, 341, 335, 334, 333]
comparison = pd.DataFrame({"test_data": y_test, "bagging_result": y_bagging})
comparison.iloc[sample_rows, :]

In [None]:
# The metric columns (positions 1-6) hold formatted strings; turn them into numbers.
# Assign by column label rather than through an .iloc slice: a positional slice
# assignment may write the numbers into the existing object-dtype columns and leave
# the dtype unchanged, whereas label assignment replaces the columns.
metric_columns = df.columns[1:7]
df[metric_columns] = df[metric_columns].apply(pd.to_numeric)

In [None]:
# Show the metrics table again after the numeric conversion in the previous cell.
df

In [None]:
# Set the default figure size to (20, 8) for the plots that follow.
sns.set(rc={'figure.figsize':(20,8)})

In [None]:
# RMSE per model, bars ordered from lowest to highest RMSE.
models_by_rmse = df.sort_values('RMSE').model
sns.barplot(data=df, x="model", y='RMSE', hue="model", order=models_by_rmse)

In [None]:
# Actual values next to the XGBoost predictions (rounded to 2 decimals), first 20 rows.
check = pd.DataFrame({"y_test": y_test, "y_predxgb": y_predxgb})
check["y_predxgb"] = check["y_predxgb"].apply(lambda value: round(value, 2))
check.head(20)

In [None]:
# close = pd.read_csv("/kaggle/input/marketing-funnel-olist/olist_closed_deals_dataset.csv")
# marketing = pd.read_csv("/kaggle/input/marketing-funnel-olist/olist_marketing_qualified_leads_dataset.csv")

In [None]:
# funnel_df = marketing.merge(close, on = "mql_id", how = "left")
# funnel_df

In [None]:
# #merging funnel df and total revenue df (379 out of 841 leads have seller data)
# funnel_df.first_contact_date = pd.to_datetime(funnel_df.first_contact_date) #normalizing contact date
# funnel_df["contact_month"] = funnel_df.first_contact_date.dt.month
# final_df_left = funnel_df.merge(total_revenue_df, on='seller_id', how="left")
# final_df_inner = funnel_df.merge(total_revenue_df, on='seller_id', how="inner")


# final_df_inner

In [None]:
# feature_imp = pd.Series(rf_model.feature_importances_,index=features).sort_values(ascending=False)
# feature_imp

In [None]:
# NOTE: this cell was not valid Python (stray leading `olist` token) and referred to
# `data` and `translator`, neither of which is defined by any cell visible above, so
# it stopped a top-to-bottom run. It is kept as a comment for reference; the cells
# below translate the review comments with an explicit loop.
# data['english'] = data['review_comment_message'].apply(translator.translate, src='pt', dest='en').apply(getattr, args=('text',))

In [None]:
pip install googletrans==4.0.0-rc1

In [None]:
import pandas as pd
import time

from googletrans import Translator

# Row labels 0..27180 are skipped. The original removed them in twelve back-to-back
# ranges - presumably batches already translated in earlier sessions (TODO confirm).
ROWS_ALREADY_DONE = 27181
# Substrings that cause a comment to be replaced by the placeholder text.
LINE_BREAK_MARKERS = ("/r", "/n", "\n", "\r")

review = pd.read_csv("/kaggle/input/brazilian-ecommerce/olist_order_reviews_dataset.csv")
t = review["review_comment_message"].fillna("no comment")
t = t.drop(list(range(ROWS_ALREADY_DONE)))
# Keep only the remaining comment texts as a plain array.
t = t.reset_index(drop=True).values
for position, comment in enumerate(t):
    if any(marker in comment for marker in LINE_BREAK_MARKERS):
        t[position] = "no comment"

In [None]:
# Translate the remaining comments one at a time, pausing after every request.
translator = Translator()
english = []
for comment in t:
    translated = translator.translate(comment)
    english.append(translated.text)
    time.sleep(0.4)


In [None]:
# Number of translated comments collected so far.
len(english)

In [None]:
# Persist the translations as a single-column CSV in the working directory.
english_df = pd.DataFrame(english)
english_df.to_csv('english.csv', index=False)