In [62]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [63]:
import pandas as pd

In [5]:
data = pd.read_csv('train_reg.csv', sep=';')

In [17]:
explanatory_variables = data.drop(columns=['companhia', 'ativos'], axis=1)

In [18]:
data.dtypes  

companhia        float64
receitas         float64
ativos           float64
passivos         float64
depreciacao      float64
ganhosPorAcao    float64
dtype: object

In [19]:
data.head()

Unnamed: 0,companhia,receitas,ativos,passivos,depreciacao,ganhosPorAcao
0,1750.0,999.0,1442100000.0,999.0,999.0,1.37
1,1800.0,999.0,41247000000.0,999.0,871000000.0,2.94
2,2034.0,999.0,540778000.0,236336000.0,1522000.0,1.19
3,2098.0,109812000.0,81421370.0,38237805.0,1318357.0,1.44
4,2178.0,1944279000.0,243215000.0,90705000.0,999.0,999.0


In [20]:
data.isnull().sum()

companhia        0
receitas         0
ativos           0
passivos         0
depreciacao      0
ganhosPorAcao    0
dtype: int64

In [21]:
data = data.fillna(999)

In [22]:
data.isnull()

Unnamed: 0,companhia,receitas,ativos,passivos,depreciacao,ganhosPorAcao
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
4195,False,False,False,False,False,False
4196,False,False,False,False,False,False
4197,False,False,False,False,False,False
4198,False,False,False,False,False,False


In [23]:
target = data['ativos']

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
x_train, x_test, y_train, y_test = train_test_split(explanatory_variables, target, random_state=42, test_size=0.3)

In [26]:
x_train.shape

(2940, 4)

In [27]:
x_test.shape

(1260, 4)

**1) importing models**

In [28]:
from sklearn.tree import DecisionTreeRegressor

In [29]:
from sklearn.ensemble import RandomForestRegressor

In [30]:
from sklearn.ensemble import GradientBoostingRegressor

In [31]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
from xgboost import XGBRFRegressor

**2) calling objects**

In [33]:
tree = DecisionTreeRegressor(random_state=42)
random_forest = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
xg = XGBRFRegressor(random_state=42)

**3)training model**

In [34]:
tree.fit(x_train, y_train)
random_forest.fit(x_train, y_train)
gb.fit(x_train, y_train)
xg.fit(x_train, y_train)

**4)visualizing results**

In [35]:
from sklearn.metrics import r2_score

In [45]:
r2_score(y_train, tree.predict(x_train))-r2_score(y_test, tree.predict(x_test))

0.05719633388676737

In [46]:
r2_score(y_train, random_forest.predict(x_train))-r2_score(y_test, random_forest.predict(x_test))

0.08342675603592653

In [47]:
r2_score(y_train, gb.predict(x_train))-r2_score(y_test, gb.predict(x_test))

0.08128244409916408

In [49]:
r2_score(y_train, xg.predict(x_train))-r2_score(y_test, xg.predict(x_test))

0.09608044078492117

**By comparing the difference between the power of explanation of each technique in the training and in the testing, the rational is to choose the one which shows the most similarity between the result in the training and in the testing, supported by the assumption that the desirable model should behave pratically the same in different samples.Because of that, the "estimator" should be the decision tree**


**5) choosing the model**

In [51]:
joblib.dump(tree, 'the_best_model_between_them.pkl', compress=1)

['the_best_model_between_them.pkl']

In [52]:
tree.feature_importances_

array([0.01536873, 0.97212775, 0.00698744, 0.00551608])

In [53]:
explanatory_variables.columns

Index(['receitas', 'passivos', 'depreciacao', 'ganhosPorAcao'], dtype='object')

***As expected, the value of the liabilities had the most importance in the model***

In [54]:
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

In [56]:
x_train.columns

Index(['receitas', 'passivos', 'depreciacao', 'ganhosPorAcao'], dtype='object')

**6) choosing the feature selection technique**

***The feature selection technique is needed to improve the accuracy of the model, by removing the variables which don't gives us much information about the phenomenon that is being studied.The question is: what is the most important things we need to know to understand something? The answer to this question gives us which is the explanatory variables we should include in the model.***

In [59]:
logistic_regression = LogisticRegression(random_state=42)
recursive_feature_elimination = RFE(logistic_regression, n_features_to_select=2, step=2)

**7) fitting the model**

In [60]:
recursive_feature_elimination.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**8) summarizing results**

In [61]:
recursive_feature_elimination_support = recursive_feature_elimination.get_support()
recursive_feature_elimination_features = x_train.loc[:, recursive_feature_elimination_support].columns.tolist()
recursive_feature_elimination_features

['receitas', 'passivos']

***The explanatory variables selected are the ones which had showed to have the most importance early in the model, that is, the cashflow and the liabilities***

In [64]:
lgbm = LGBMClassifier(n_estimators=200, random_state=42)

In [65]:
sfm_selector = SelectFromModel(lgbm, max_features=3)

***The LGBM Classifier is a kind of feature selection that chooses variables to be part of the model based on the assumption that the prediction error is associated with two kinds of error: the error coming from the bias of the coefficients and the error coming from the variance of the estimator associated with: "how nervous" is the phenomenon (simbolized by sigma squared of the residuals) which is bigger when you have a lot of variables to analyze, once the increase of the number of variables decrease the number of degrees of freedom that you have.Said that, this estimator chooses the coefficient estimates that minimizes a measure of these two types of errors***

In [66]:
sfm_selector.fit(x_train, y_train)

In [67]:
sfm_support = sfm_selector.get_support()

In [68]:
sfm_features = x_train.loc[:, sfm_support].columns.tolist()
sfm_features

['passivos']

***Although there is not much sense to perform a model using just one variable, the estimator suggested that only the liabilities are needed to forecast de variable of interest***

In [69]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### ***Tunning of Hiperparameters***

In [76]:
from sklearn.tree import DecisionTreeClassifier

**1) creating a dictionary defining where the algoritm should go**

In [77]:
decision_tree_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1,5,10,20,50,100],
    'min_samples_leaf': [1,10,20,50]
}

**2) creating a tunning object**

In [78]:
decisiontree = DecisionTreeClassifier(random_state=42)

decision_tree_grid = GridSearchCV(decisiontree, decision_tree_grid)

**3) training the model**

In [80]:
decision_tree_grid.fit(x_train, y_train)



In [81]:
decision_tree_grid.best_estimator_

In [85]:
decision_tree_grid.best_params_

{'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1}

**The "advice" of the model by choosing "entropy" instead of "gini" suggest that we need an architecture with more layers which will allow us to have more accuracy in the answers we get**

In [86]:
decision_tree_grid.best_score_

0.03707482993197279