In [92]:
import pandas as pd
import numpy as np
# Import the LinearRegression class from scikit-learn's linear_model module
from sklearn.linear_model import LinearRegression
# Import the split function from sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
from sklearn import metrics

In [103]:
# Load your dataset
df = pd.read_csv('12411-0003_$F.csv')

In [104]:
# View the first five rows 
df.head()

Unnamed: 0,date,female,male,total
0,31.12.1970,29071621,31929543,61001164
1,31.12.1971,29367427,32135076,61502503
2,31.12.1972,29533254,32276124,61809378
3,31.12.1973,29713753,32387616,62101369
4,31.12.1974,29604450,32387025,61991475


## Data Cleaning

In [105]:
# Extracting just the year column from the date
df['year'] = df['date'].apply(lambda x:x[-4:])

# Rearrange the columns you need for the machine learning
df = df[['year', 'female', 'male', 'total']]
df.head()

Unnamed: 0,year,female,male,total
0,1970,29071621,31929543,61001164
1,1971,29367427,32135076,61502503
2,1972,29533254,32276124,61809378
3,1973,29713753,32387616,62101369
4,1974,29604450,32387025,61991475


In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    54 non-null     object
 1   female  54 non-null     int64 
 2   male    54 non-null     int64 
 3   total   54 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 1.8+ KB


## Machine Learning

In [107]:
# Initialise the LinearRegression model
lm = LinearRegression()

In [108]:
# Split your data into Response and Feature variables 
X = df['year'] # feature
y = df['female'] # response

In [100]:
# Confirming the shape of the Response and Feature Varriables
print(f'The shape of the response (y) variable is: {y.shape}\n' +
        f'The shape of the feature (X) variable is: {X.shape}')

The shape of the response (y) variable is: (54,)
The shape of the feature (X) variable is: (54,)


In [109]:
# Reshaping the X_train data to fit into the regression model
X_reshaped = np.array(X).reshape(-1,1)

In [110]:
# Call the 'train_test_split' function:
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=25)

In [111]:
# Fit the model to the training data 
lm.fit(X_train_reshaped, y_train)

In [112]:
# Extract the intercept, or y-cut, of our linear model
c = float(lm.intercept_)
# Extract the coefficient, or gradient, of our linear model
m = lm.coef_[0]

In [113]:
# Displaying the Slope and Intercept on the regression model
print(f"Slope:\t\t {m}\nIntercept:\t {c}")

Slope:		 281839.68433727697
Intercept:	 -526718974.4684461


## Model Evaluation

In [114]:
# Generate the values that fall along our regression line
gen_y = lm.predict(X_train)

In [115]:
print("Training:")
# Calculate the mean-squared-error
print('MSE:', metrics.mean_squared_error(y_train, gen_y))
# Calculate the R-squared metric
print('R_squared:', metrics.r2_score(y_train, gen_y))

Training:
MSE: 6629603828965.309
R_squared: 0.7636834195196048


In [116]:
# Generate values of y from x, using the linear model
gen_y_test = lm.predict(X_test)

In [118]:
print("Testing:")
# Calculate the mean-squared-error
print('MSE:', metrics.mean_squared_error(y_test, gen_y_test))
# Calculate the R-squared metric
print('R_squared:', metrics.r2_score(y_test, gen_y_test))

Testing:
MSE: 6281686384721.853
R_squared: 0.7234499424778538
