In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv (r'C:\Afif\SEM4\Data Analytics\train.csv')   
test = pd.read_csv (r'C:\Afif\SEM4\Data Analytics\test.csv')   

In [2]:
train.head

<bound method NDFrame.head of              id                                          url_legal  \
0     c12129c31                                                NaN   
1     85aa80a4c                                                NaN   
2     b69ac6792                                                NaN   
3     dd1000b26                                                NaN   
4     37c1b32fb                                                NaN   
...         ...                                                ...   
2829  25ca8f498  https://sites.ehe.osu.edu/beyondpenguins/files...   
2830  2c26db523  https://en.wikibooks.org/wiki/Wikijunior:The_E...   
2831  cd19e2350  https://en.wikibooks.org/wiki/Wikijunior:The_E...   
2832  15e2e9e7a  https://en.wikibooks.org/wiki/Geometry_for_Ele...   
2833  5b990ba77  https://en.wikibooks.org/wiki/Wikijunior:Biolo...   

           license                                            excerpt  \
0              NaN  When the young people returned to th

In [3]:
train.shape

(2834, 6)

In [4]:
test.shape

(7, 4)

In [5]:
train.isnull().any()

id                False
url_legal          True
license            True
excerpt           False
target            False
standard_error    False
dtype: bool

In [6]:
import nltk

# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
train['clean_text'] = train['excerpt'].apply(lambda x: clean_text(x))

In [7]:
# select 2 columns
df = train[['clean_text','target']]
df.head(3)

Unnamed: 0,clean_text,target
0,young people return ballroom present decidedly...,-0.340259
1,dinner time mrs fayre somewhat silent eye rest...,-0.315372
2,roger predict snow departed quickly come two d...,-0.580118


In [8]:
dfX= df['clean_text']
dfY=df['target']

## CountVectorizer for Pipeline (Count)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(dfX)
X_train_vectorized = vect.transform(dfX)
len(vect.get_feature_names())


7115

# TFIDF transformer

In [10]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(X_train_vectorized)

tfidf_matrix =  tfidf_transformer.fit_transform(X_train_vectorized)

## Split Data

In [11]:
from sklearn.model_selection import train_test_split

train_X2, val_X2, train_y2, val_y2 = train_test_split(tfidf_matrix, dfY, test_size = 0.3, random_state=71)

In [15]:
lgb_train = lgb.Dataset(train_X2, train_y2)
lgb_eval = lgb.Dataset(val_X2, val_y2, reference=lgb_train)

## Model 0 : LGB Model with same parameters in baseline

In [12]:
import lightgbm as lgb

In [17]:
params = {
    'objective': 'regression',   # 回帰問題
    'metric': 'rmse',            # RMSEで評価
}
lgbModel = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                     verbose_eval=100,  # 50イテレーション毎に学習結果出力
                     num_boost_round=1000,  # 最大イテレーション回数指定
                     early_stopping_rounds=500,
                    )

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31813
[LightGBM] [Info] Number of data points in the train set: 1983, number of used features: 1364
[LightGBM] [Info] Start training from score -0.946486
Training until validation scores don't improve for 500 rounds
[100]	valid_0's rmse: 0.821454
[200]	valid_0's rmse: 0.830269
[300]	valid_0's rmse: 0.834782
[400]	valid_0's rmse: 0.837365
[500]	valid_0's rmse: 0.838521
Early stopping, best iteration is:
[85]	valid_0's rmse: 0.819103


## Model 1 : LGB Model

In [86]:
import lightgbm as lgb

In [88]:
lgModel2 = lgb.LGBMRegressor().fit(train_X2, train_y2)
y_pred = lgModel2.predict(val_X2)
rmse = np.sqrt(metrics.mean_squared_error(val_y2, y_pred))
rmse

0.824240571071137

## Model 2 : Linear

In [89]:
from sklearn.linear_model import LinearRegression

In [91]:
lrModel = LinearRegression().fit(train_X2, train_y2)
y_pred = lrModel.predict(val_X2)
rmse = np.sqrt(metrics.mean_squared_error(val_y2, y_pred))
rmse

0.9244243751849708

## Model 3 : Ridge 

In [95]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(train_X2,train_y2)

Ridge()

In [96]:
y_pred2 = ridge.predict(val_X2)
rmse = np.sqrt(metrics.mean_squared_error(val_y2, y_pred2))
rmse

0.7596590967572867

## Model 4 : Decision Tree

In [93]:
from sklearn.tree import DecisionTreeRegressor 

In [94]:
dtModel = DecisionTreeRegressor().fit(train_X2, train_y2)
y_pred = dtModel.predict(val_X2)
rmse = np.sqrt(metrics.mean_squared_error(val_y2, y_pred))
rmse

1.2227667016879396

## Model 5 : Random Forest

In [82]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()
model_rf.fit(train_X2,train_y2)

RandomForestRegressor()

In [83]:
from sklearn import metrics

y_pred = model_rf.predict(val_X2)
rmse = np.sqrt(metrics.mean_squared_error(val_y2, y_pred))
rmse

0.8569939115230927

## Hyper tuning

In [97]:
from pprint import pprint

ridge_tuning = Ridge()

print('Parameters currently in use:\n')
pprint(ridge_tuning.get_params())

Parameters currently in use:

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}


In [107]:
from sklearn.model_selection import RandomizedSearchCV

# alpha : range
alpha = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1.0, 1.3, 1.5]


# fit_intercept
fit_intercept = [True , False]


# max_iter
max_iter = [50, 100, 300, 500, 700, 1000, None]


# normalize
normalize = [True , False]

# solver
solver = ['auto','sparse_cg']


# Create the random grid
random_grid = {'alpha': alpha,  
               'fit_intercept': fit_intercept,
               'max_iter': max_iter,
               'normalize': normalize,
               'solver': solver}


pprint(random_grid)

{'alpha': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1.0, 1.3, 1.5],
 'fit_intercept': [True, False],
 'max_iter': [50, 100, 300, 500, 700, 1000, None],
 'normalize': [True, False],
 'solver': ['auto', 'sparse_cg']}


In [108]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
r_random = RandomizedSearchCV(estimator = ridge_tuning, param_distributions = random_grid, scoring="neg_mean_squared_error", n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
r_random.fit(train_X2,train_y2)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    2.1s finished


RandomizedSearchCV(cv=3, estimator=Ridge(), n_iter=100, n_jobs=-1,
                   param_distributions={'alpha': [0.001, 0.01, 0.1, 0.3, 0.5,
                                                  0.7, 1.0, 1.3, 1.5],
                                        'fit_intercept': [True, False],
                                        'max_iter': [50, 100, 300, 500, 700,
                                                     1000, None],
                                        'normalize': [True, False],
                                        'solver': ['auto', 'sparse_cg']},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [109]:
r_random.best_params_

{'solver': 'auto',
 'normalize': False,
 'max_iter': 1000,
 'fit_intercept': True,
 'alpha': 1.0}

In [None]:
{'solver': 'auto',
 'normalize': True,
 'max_iter': 500,
 'fit_intercept': True,
 'alpha': 2.5}

In [None]:
{'solver': 'auto',
 'normalize': False,
 'max_iter': 1000,
 'fit_intercept': True,
 'alpha': 1.0}

In [112]:
base_model = Ridge(random_state = 42)
base_model.fit(train_X2,train_y2)
pred_base = base_model.predict(val_X2)
rmse_base = np.sqrt(metrics.mean_squared_error(val_y2, pred_base))
print("RMSE Base: ",rmse_base)

best_random = Ridge(alpha = 2.5, solver = 'auto', normalize = True, max_iter = 500, fit_intercept = True, random_state = 42)
best_random.fit(train_X2,train_y2)
pred_random = best_random.predict(val_X2)
rmse_random = np.sqrt(metrics.mean_squared_error(val_y2, pred_random))
print("RMSE Random: ",rmse_random)

print('Improvement of {:0.2f}%.'.format( 100 * (rmse_base - rmse_random) / rmse_base))

RMSE Base:  0.7596590967572867
RMSE Random:  0.7567051108744559
Improvement of 0.39%.


## GRID SEARCH CV

In [121]:
from sklearn.model_selection import GridSearchCV

# alpha : range
alpha = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1.0, 1.3, 1.5,2.5,5.0,10]


# fit_intercept
fit_intercept = [True , False]


# max_iter
max_iter = [50, 100, 300, 500, 700, 1000, None]


# normalize
normalize = [True , False]

# solver
solver = ['auto','sparse_cg']


# Create the random grid
search_grid = {'alpha': alpha,
               'fit_intercept': fit_intercept,
               'max_iter': max_iter,
               'normalize': normalize,
               'solver': solver}


pprint(search_grid)

{'alpha': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1.0, 1.3, 1.5, 2.5, 5.0, 10],
 'fit_intercept': [True, False],
 'max_iter': [50, 100, 300, 500, 700, 1000, None],
 'normalize': [True, False],
 'solver': ['auto', 'sparse_cg']}


In [122]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
r_grid = GridSearchCV(estimator = ridge_tuning, param_grid = search_grid, scoring="neg_mean_squared_error", cv = 3, verbose=2,  n_jobs = -1)
# Fit the random search model
r_grid.fit(train_X2,train_y2)

Fitting 3 folds for each of 672 candidates, totalling 2016 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 228 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 634 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 1744 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 2016 out of 2016 | elapsed:   16.1s finished


GridSearchCV(cv=3, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1.0, 1.3,
                                   1.5, 2.5, 5.0, 10],
                         'fit_intercept': [True, False],
                         'max_iter': [50, 100, 300, 500, 700, 1000, None],
                         'normalize': [True, False],
                         'solver': ['auto', 'sparse_cg']},
             scoring='neg_mean_squared_error', verbose=2)

In [123]:
r_grid.best_params_

{'alpha': 2.5,
 'fit_intercept': True,
 'max_iter': 50,
 'normalize': True,
 'solver': 'auto'}

In [124]:
base_model = Ridge(random_state = 42)
base_model.fit(train_X2,train_y2)
pred_base = base_model.predict(val_X2)
rmse_base = np.sqrt(metrics.mean_squared_error(val_y2, pred_base))
print("RMSE Base: ",rmse_base)

best_grid = Ridge(alpha = 2.5, solver = 'auto', normalize = True, max_iter = 50, fit_intercept = True, random_state = 42)
best_grid.fit(train_X2,train_y2)
pred_grid = best_grid.predict(val_X2)
rmse_grid = np.sqrt(metrics.mean_squared_error(val_y2, pred_grid))
print("RMSE Grid: ",rmse_grid)

print('Improvement of {:0.2f}%.'.format( 100 * (rmse_base - rmse_grid) / rmse_base))

RMSE Base:  0.7596590967572867
RMSE Grid:  0.7567051108744559
Improvement of 0.39%.


## GRID SEARCH CV LEVEL 2

In [150]:
from sklearn.model_selection import GridSearchCV

# alpha : range
alpha = [2.3,2.4,2.5,2.6]


# fit_intercept
fit_intercept = [True , False]


# max_iter
max_iter = [50,60,70 , None]


# normalize
normalize = [True , False]

# solver
solver = ['auto','sparse_cg']


# Create the random grid
search_grid = {'alpha': alpha,
               'fit_intercept': fit_intercept,
               'max_iter': max_iter,
               'normalize': normalize,
               'solver': solver}


pprint(search_grid)

{'alpha': [2.3, 2.4, 2.5, 2.6],
 'fit_intercept': [True, False],
 'max_iter': [50, 60, 70, None],
 'normalize': [True, False],
 'solver': ['auto', 'sparse_cg']}


In [151]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
r_grid = GridSearchCV(estimator = ridge_tuning, param_grid = search_grid, scoring="neg_mean_squared_error", cv = 3, verbose=2,  n_jobs = -1)
# Fit the random search model
r_grid.fit(train_X2,train_y2)

Fitting 3 folds for each of 128 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:    1.9s finished


GridSearchCV(cv=3, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [2.3, 2.4, 2.5, 2.6],
                         'fit_intercept': [True, False],
                         'max_iter': [50, 60, 70, None],
                         'normalize': [True, False],
                         'solver': ['auto', 'sparse_cg']},
             scoring='neg_mean_squared_error', verbose=2)

In [152]:
r_grid.best_params_

{'alpha': 2.6,
 'fit_intercept': True,
 'max_iter': 50,
 'normalize': True,
 'solver': 'auto'}

In [153]:
base_model = Ridge(random_state = 42)
base_model.fit(train_X2,train_y2)
pred_base = base_model.predict(val_X2)
rmse_base = np.sqrt(metrics.mean_squared_error(val_y2, pred_base))
print("RMSE Base: ",rmse_base)

best_grid = Ridge(alpha = 2.6, solver = 'auto', normalize = True, max_iter = 50, fit_intercept = True, random_state = 42)
best_grid.fit(train_X2,train_y2)
pred_grid = best_grid.predict(val_X2)
rmse_grid = np.sqrt(metrics.mean_squared_error(val_y2, pred_grid))
print("RMSE Grid: ",rmse_grid)

print('Improvement of {:0.2f}%.'.format( 100 * (rmse_base - rmse_grid) / rmse_base))

RMSE Base:  0.7596590967572867
RMSE Grid:  0.7568998143139294
Improvement of 0.36%.
