In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Synthetic data: Salary is a linear function of Age and Education plus Gaussian noise.
#   Salary = 50 + 0.5 * (Age - 20) + 0.9 * Education + N(0, 3)
N_SAMPLES = 95
np.random.seed(0)  # fixed seed so the generated data is reproducible

# The order of the random draws matters for reproducibility: ages, education, then noise.
ages = np.random.randint(20, 60, size=N_SAMPLES)      # integer ages in [20, 60)
education = np.random.randint(2, 6, size=N_SAMPLES)   # integer education level in [2, 6)
salaries = 50 + (ages - 20) * 0.5 + 0.9 * education + np.random.normal(0, 3, size=N_SAMPLES)

# Collect the generated columns into a single DataFrame used by the rest of the notebook
df = pd.DataFrame({'Age': ages, 'Education': education, 'Salary': salaries})
df.describe()

One-Sample T-Test

In [None]:
# Hypothesis testing: one-sample t-test on the Age column
# Null hypothesis: the population mean of Age equals 20
hypothesized_mean_age = 20
t_statistic, p_value = stats.ttest_1samp(df['Age'], popmean=hypothesized_mean_age)

print("T-statistic:", t_statistic)
print("p-value:", p_value)

Two-Sample T-Test

In [None]:
# Two-sample t-test: is there a significant difference between the means of Age and Salary?
# equal_var=False selects Welch's t-test, which does not assume the two columns share
# the same variance (Age and Salary are measured on different scales).
# NOTE: both columns come from the same rows of df, so the observations are paired rather
# than independent; stats.ttest_rel is the paired alternative if that pairing matters.
t_statistic, p_value = stats.ttest_ind(df['Age'], df['Salary'], equal_var=False)
print("T-statistic:", t_statistic)
print("p-value:", p_value)

In [None]:
# Pearson correlation between Age and Salary.
# np.corrcoef returns the full 2x2 correlation matrix; the off-diagonal
# entries hold the Age-Salary coefficient.
age_values = df['Age'].to_numpy()
salary_values = df['Salary'].to_numpy()
correlation_coef = np.corrcoef(age_values, salary_values)
correlation_coef

Linear Regression

In [None]:
# Target variable and feature matrix (scikit-learn expects a 2D feature array)
y = df['Salary']
X = df.loc[:, ['Age', 'Education']]

# Create the linear regression estimator and fit it on the full dataset
model = LinearRegression()
model.fit(X, y)

In [None]:
# Report the fitted parameters: one coefficient per feature column, plus the intercept
for label, value in (
    ("Model Coefficients (Slope):", model.coef_),
    ("Model Intercept:", model.intercept_),
):
    print(label, value)

In [None]:
# 3-fold cross-validation; with no `scoring` argument each fold is scored with the
# estimator's own score method (R^2 for LinearRegression), so larger is better
cv_scores = cross_val_score(estimator=model, X=X, y=y, cv=3)
mean_cv_score = cv_scores.mean()

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", mean_cv_score)

Make Predictions

In [None]:
# In-sample predictions: the model is evaluated on the same X it was fitted on
y_pred = model.predict(X)
y_pred[:5]  # preview only; avoid dumping the whole prediction array into the output

Evaluate the Model

In [None]:
# Coefficient of determination of the in-sample predictions (closer to 1 is better)
r2 = r2_score(y_true=y, y_pred=y_pred)
print(f'R-squared: {r2}')

In [None]:
# Error metrics of the in-sample predictions; for both, larger means a worse fit
mse = mean_squared_error(y_true=y, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

mae = mean_absolute_error(y_true=y, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')

Visualization (Actual vs Predicted Values)

In [None]:
# Actual vs predicted Salary against Age.
# The predicted series comes from y_pred (the model's in-sample predictions), so the
# figure shows both series promised by its title.
fig, ax = plt.subplots()
ax.scatter(X['Age'], y, color='blue', label='Actual')
ax.scatter(X['Age'], y_pred, color='red', marker='x', label='Predicted')
ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Age')
ax.set_ylabel('Salary')
ax.legend()
plt.show()

In [None]:
# 3D view: Age and Education are the independent variables, Salary is the dependent one.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the observed salaries and the model's in-sample predictions at the same
# (Age, Education) coordinates so the two can be compared directly.
ax.scatter(df['Age'], df['Education'], y, color='blue', label='Actual')
ax.scatter(df['Age'], df['Education'], y_pred, color='red', marker='x', label='Predicted')
ax.set_xlabel('Age')
ax.set_ylabel('Education')
ax.set_zlabel('Salary')
ax.set_title('Actual vs Predicted Salary')
ax.legend()  # the series labels are only visible once a legend is drawn
plt.show()

In [None]:
# Pairwise scatter plots and per-variable distributions for all relevant columns
pairplot_columns = ['Age', 'Education', 'Salary']
sns.pairplot(data=df[pairplot_columns])

Linear Regression 2 (statsmodels OLS)

In [None]:
# statsmodels' OLS does not add an intercept on its own, so prepend a constant column.
# The result is stored under a new name so that X stays the plain feature matrix used
# by the scikit-learn cells, which keeps every cell safe to re-run.
X_const = sm.add_constant(X)

# Fit ordinary least squares. A separate name is used so that `model` keeps referring
# to the scikit-learn LinearRegression estimator.
ols_model = sm.OLS(y, X_const).fit()

# Full regression summary (coefficients, standard errors, p-values, R-squared, ...)
ols_model.summary()