In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the competition train/test files and the additional concrete-strength
# file. The "id" column is removed from the two competition frames right away.
train_df = pd.read_csv("/kaggle/input/playground-series-s3e9/train.csv").drop(columns="id")
add_df = pd.read_csv("/kaggle/input/predict-concrete-strength/ConcreteStrengthData.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s3e9/test.csv").drop(columns="id")

In [None]:
print(train_df.shape)
train_df.head()

In [None]:
train_df["CementComp"] = train_df["CementComponent"]
train_df["Flag"] = 1
train_df.drop("CementComponent",inplace=True,axis=1)

In [None]:
print(add_df.shape)
add_df.head()

In [None]:
add_df["CementComp"] = add_df.iloc[:,0]
add_df["Flag"] = 0
add_df = add_df.iloc[:,1:]

In [None]:
train_df = pd.concat([train_df,add_df],ignore_index=True)

In [None]:
train_df

In [None]:
train_df.isnull().sum()

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
# Histogram of every column of the combined frame, laid out two per row.
ncols = 2
nrows = int(np.ceil(len(train_df.columns) / ncols))
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, nrows * 2.5))
for col_name, panel in zip(train_df.columns, axs.ravel()):
    sns.histplot(data=train_df, x=col_name, ax=panel)
fig.suptitle('Distribution of all variables', fontsize=20)
# Leave the top 2% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.98])

In [None]:
# Mirror the training-frame preparation on the test frame: move the cement
# column to the end under the name "CementComp", then tag every row with Flag=1.
test_df["CementComp"] = test_df.pop("CementComponent")
test_df["Flag"] = 1

In [None]:
TARGET = "Strength"
sns.histplot(train_df[TARGET])

In [None]:
columns = ["BlastFurnaceSlag","FlyAshComponent","WaterComponent","SuperplasticizerComponent","CoarseAggregateComponent",
           "FineAggregateComponent","AgeInDays","CementComp","Flag"]

In [None]:
# One joint plot (scatter plus marginal distributions) per feature against
# the target.
# sns.jointplot is a figure-level function: each call builds its own figure,
# so no Axes is passed in. The previous version zipped the features with
# `axs`, a leftover from the histogram cell, which made this cell depend on
# another cell's state and on that grid having enough panels.
for feature in columns:
    sns.jointplot(data=train_df, x=feature, y=TARGET)

In [None]:
# Correlation heatmap of the (at most) k columns most positively correlated
# with the target; nlargest returns fewer rows when the frame has fewer columns.
k = 15
corrmat = train_df.corr()
cols = corrmat.nlargest(k, TARGET)[TARGET].index
# Reuse the matrix that produced the ranking instead of recomputing it with
# np.corrcoef: the two agree on complete data, but np.corrcoef turns every
# affected entry into NaN when a value is missing, while DataFrame.corr
# excludes missing values pairwise. Slicing keeps ranking and plot consistent.
cm = corrmat.loc[cols, cols].to_numpy()
fig, ax = plt.subplots(figsize=(12, 10))
# NOTE(review): the "IPAexGothic" font has to be installed for this setting to
# take effect -- confirm it is available in the runtime.
sns.set(font_scale=1.2,font="IPAexGothic")
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt=".2f",
    annot_kws={"size": 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
    ax=ax,
)
plt.show()

In [None]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor

In [None]:
X = train_df[columns]
y = train_df[TARGET]

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,random_state=0)

In [None]:
# LightGBM regressor evaluated with RMSE; up to 2000 trees, with training
# stopped early once the validation score stops improving.
lgb_params = {
        'objective':'regression', 
        'metric': 'rmse', 
        'task': 'train', 
        'seed': 42,
        'n_estimators':2000
        }

model_lgb = lgb.LGBMRegressor(**lgb_params)
# Early stopping and periodic logging are passed as callbacks: the `verbose`
# and `early_stopping_rounds` keyword arguments of fit() were removed in
# LightGBM 4.0, while the callback form works on both older and newer releases.
model_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        lgb.log_evaluation(period=10),            # print the metrics every 10 rounds
    ],
)

In [None]:
# XGBoost regressor with squared-error objective; up to 2000 trees, with
# training stopped early once the validation score stops improving.
xgb_params = {
        'objective': 'reg:squarederror',
        'seed':42,
        'n_estimators':2000
        }

# early_stopping_rounds is given to the constructor: passing it to fit() has
# been deprecated since XGBoost 1.6 and is rejected by newer releases.
# NOTE(review): the constructor form requires xgboost >= 1.6 -- confirm the
# runtime version.
model_xgb = xgb.XGBRegressor(early_stopping_rounds=100, **xgb_params)
# The validation pair is listed last in eval_set; XGBoost uses the last entry
# for early stopping. verbose=100 prints the metrics every 100 rounds.
model_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=100,
)

In [None]:
# Gain-based feature importance of the fitted LightGBM model (at most 50 bars).
lgb.plot_importance(
    model_lgb,
    importance_type='gain',
    max_num_features=50,
    figsize=(12,8),
)
plt.tight_layout()
plt.show()
plt.close()

In [None]:
sub = pd.read_csv('/kaggle/input/playground-series-s3e9/sample_submission.csv')
sub

In [None]:
y_xgb = model_xgb.predict(test_df)
sub[TARGET] = y_xgb
sub.to_csv('/kaggle/working/xgb3.csv',index=False)

In [None]:
y_lgb = model_lgb.predict(test_df)
sub[TARGET] = y_lgb
sub.to_csv('/kaggle/working/lgb3.csv',index=False)

In [None]:
# Distribution of the predictions currently stored in the submission frame.
sns.histplot(x=sub[TARGET])