In [1]:
import pandas as pd
import numpy as np
# Import the LinearRegression class from scikit-learn's linear_model module
from sklearn.linear_model import LinearRegression
# Import the DecisionTreeRegressor class from scikit-learn's linear_model module
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Import the split function from sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load your dataset
df = pd.read_csv('12411-0003_$F.csv')

In [3]:
# View the first five rows 
df.head()

Unnamed: 0,date,male,female,total
0,31.12.1970,29071621,31929543,61001164
1,31.12.1971,29367427,32135076,61502503
2,31.12.1972,29533254,32276124,61809378
3,31.12.1973,29713753,32387616,62101369
4,31.12.1974,29604450,32387025,61991475


## Data Cleaning

In [4]:
# Extracting just the year column from the date
df['year'] = df['date'].apply(lambda x:x[-4:])

# Rearrange the columns you need for the machine learning
df = df[['year', 'male', 'female', 'total']]
df.head()

Unnamed: 0,year,male,female,total
0,1970,29071621,31929543,61001164
1,1971,29367427,32135076,61502503
2,1972,29533254,32276124,61809378
3,1973,29713753,32387616,62101369
4,1974,29604450,32387025,61991475


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    54 non-null     object
 1   male    54 non-null     int64 
 2   female  54 non-null     int64 
 3   total   54 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 1.8+ KB


## Machine Learning

### Preparing your Training and Testing data

In [6]:
# Split your data into Response and Feature variables 
X = df['year'] # feature
y = df['male'] # response

In [7]:
# Confirming the shape of the Response and Feature Varriables
print(f'The shape of the response (y) variable is: {y.shape}\n' +
        f'The shape of the feature (X) variable is: {X.shape}')

The shape of the response (y) variable is: (54,)
The shape of the feature (X) variable is: (54,)


In [8]:
# Reshaping the X_train data to fit into the regression model
X_reshaped = np.array(X).reshape(-1,1)

In [9]:
# Call the 'train_test_split' function:
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=25)

### Model Evalution 1 - Linear Regression

In [10]:
# Initialise the LinearRegression model
lm = LinearRegression()

In [11]:
# Fit the model to the training data 
lm.fit(X_train, y_train)

In [12]:
# Extract the intercept, or y-cut, of our linear model
c = float(lm.intercept_)
# Extract the coefficient, or gradient, of our linear model
m = lm.coef_[0]

In [13]:
# Displaying the Slope and Intercept on the regression model
print(f"Slope:\t\t {m}\nIntercept:\t {c}")

Slope:		 281839.68433727697
Intercept:	 -526718974.4684461


In [14]:
# Generate the values that fall along our regression line
gen_y_train = lm.predict(X_train)

In [15]:
print("Training:")
# Calculate the mean-squared-error
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_train, gen_y_train)))
# Calculate the R-squared metric
print('R_squared:', metrics.r2_score(y_train, gen_y_train))

Training:
RMSE: 2574801.706727201
R_squared: 0.7636834195196048


In [16]:
# Generate values of y from x, using the linear model
gen_y_test = lm.predict(X_test)

In [17]:
print("Testing:")
# Calculate the mean-squared-error
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, gen_y_test)))
# Calculate the R-squared metric
print('R_squared:', metrics.r2_score(y_test, gen_y_test))

Testing:
RMSE: 2506329.2650252185
R_squared: 0.7234499424778538


### Model Evaluation 2 - Decision Tree Regressor

In [18]:
# Instantiate regression tree model
regr_tree = DecisionTreeRegressor(max_depth=2,random_state=42)

In [19]:
# Fit the data to the regression tree model
regr_tree.fit(X_train,y_train)

In [20]:
# Generate the values that fall along our descision tree line
gen_y_train =  regr_tree.predict(X_train)

In [21]:
print("Trainnig :")
# Calculate the mean-squared-error
print("RMSE :", np.sqrt(metrics.mean_squared_error(gen_y_train, y_train)))
# Calculate the r-squared metric
print("R_squared :", metrics.r2_score(gen_y_train, y_train))

Trainnig :
RMSE : 372883.4485103718
R_squared : 0.9950190675520957


In [22]:
# Generate values of y from x, using the linear model
gen_y_test = regr_tree.predict(X_test)

In [23]:
print("Testing :")
# Calculate the root mean-squared-error
print("RMSE :", np.sqrt(metrics.mean_squared_error(gen_y_test, y_test)))
# Calculate the r-squared metric
print("R_squared :", metrics.r2_score(gen_y_test, y_test))

Testing :
RMSE : 541571.2794681438
R_squared : 0.9860741434501291


### Putting it all together

In [24]:
def model_evaluation(df, model, features, ts=0.2, rs=25):
    """Evaluates a regression model on a given dataset.

    This function performs the following steps:
    1. Splits the dataset into training and testing sets for each feature.
    2. Fits the provided model to the training data.
    3. Generates predictions on the testing set.
    4. Calculates evaluation metrics (RMSE, R-squared, MAE, MSE) for each feature.
    5. Returns a pandas DataFrame containing the evaluation results.

    Args:
        df (pandas.DataFrame): The input dataset containing features and target variables.
        model (sklearn model object): The trained regression model to evaluate.
        features (list): A list of feature names to evaluate.
        ts (float, optional): The test size for train-test splitting. Defaults to 0.2.
        rs (int, optional): The random state for train-test splitting. Defaults to 25.

    Returns:
        pandas.DataFrame: A DataFrame containing evaluation metrics for each feature.
    """

    data = []

    for feature in features:
        data_block = []

        # Splitting the dataset into feature and response variables
        X = df['year']
        y = df[feature]

        # Reshape the X_train data to fit into the regression model
        X_reshaped = np.array(X).reshape(-1, 1)

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=ts, random_state=rs)

        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Generate predictions on the testing set
        gen_y = model.predict(X_test)

        # Calculate evaluation metrics
        rmse = float(np.sqrt(metrics.mean_squared_error(gen_y, y_test)))
        r2 = float(metrics.r2_score(gen_y, y_test))
        mae = float(metrics.mean_absolute_error(gen_y, y_test))
        mse = float(metrics.mean_squared_error(gen_y, y_test))

        data_block.append(feature)
        data_block.append(rmse)
        data_block.append(r2)
        data_block.append(mae)
        data_block.append(mse)

        data.append(data_block)

    # Create a pandas DataFrame to store the results
    model_evaluation_df = pd.DataFrame(data, columns=['category', 'rmse', 'r_square', 'mae', 'mse'])

    return model_evaluation_df

In [25]:
# Instantiate a linear regression model
linear_regressor = LinearRegression()

In [26]:
# Call the model_evaluation function and get results
df_linear_model = model_evaluation(df, linear_regressor, ['male', 'female', 'total'])
df_linear_model

Unnamed: 0,category,rmse,r_square,mae,mse
0,male,2506329.0,0.421895,2253193.0,6281686000000.0
1,female,2514944.0,0.231327,2259832.0,6324941000000.0
2,total,5020186.0,0.336882,4513025.0,25202260000000.0


In [27]:
# # Instantiate a regression tree model
decision_tree_regressor = DecisionTreeRegressor(max_depth=2,random_state=42)

In [28]:
# Call the model_evaluation function and get results
df_decision_tree_model = model_evaluation(df, decision_tree_regressor, ['male', 'female', 'total'])
df_decision_tree_model

Unnamed: 0,category,rmse,r_square,mae,mse
0,male,541571.279468,0.986074,495947.59596,293299500000.0
1,female,291898.253863,0.995428,250746.121901,85204590000.0
2,total,865957.9725,0.990482,794151.278237,749883200000.0
