In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the earthquake CSV from the working directory into the raw frame `df`
# and show it as the cell output.
csv_path = "earthquake_1995-2023.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Recode the 0/1 tsunami flag as booleans.
# An explicit mapping is used instead of Series.replace, which routes an
# int -> bool replacement through pandas' deprecated object-downcasting path.
# Any value other than 0 or 1 becomes NaN rather than being passed through.
# Re-running the cell is safe: True/False look up the same dict entries as 1/0.
df['tsunami'] = df['tsunami'].map({1: True, 0: False})

In [None]:
# Restrict the frame to the descriptive fields and numeric measurements that
# the cells below work with, then display the result.
selected_columns = [
    'title', 'alert', 'tsunami', 'country',
    'magnitude', 'nst', 'mmi', 'sig', 'depth',
]
data = df[selected_columns]
data

In [None]:
# Numeric columns screened for extreme values.
cols = ['magnitude', 'nst', 'mmi', 'sig', 'depth']
numeric = data[cols]

# DataFrame.quantile takes fractions in [0, 1] (np.percentile takes 0-100).
# NOTE(review): despite the Q1/Q3/IQR names, these are the 10th and 90th
# percentiles, not the quartiles, so the fences are much wider than a
# textbook 1.5*IQR rule. Confirm whether 0.25/0.75 was intended.
Q1 = numeric.quantile(0.1)
Q3 = numeric.quantile(0.9)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is kept only if none of its numeric values lies outside the fences.
# Missing values compare False on both sides, so they never flag a row here.
is_outlier = numeric.lt(lower_fence) | numeric.gt(upper_fence)
condition = ~is_outlier.any(axis=1)

data_filtered = data[condition]

In [None]:
# Drop every row that still has a missing value in any column.
data_cleaned = data_filtered.dropna()

# Name used by the plotting and modelling cells below; it is a plain alias of
# the cleaned frame (no sampling is applied here).
data_sampled = data_cleaned

# Per-column missing-value counts before and after dropna. Kept as the last
# expression of the cell so Jupyter actually renders it.
pd.DataFrame({
    'nulls_before': data_filtered.isnull().sum(),
    'nulls_after': data_cleaned.isnull().sum(),
})

In [None]:
# Side-by-side histograms of magnitude, nst and mmi, one panel per column.
hist_columns = ('magnitude', 'nst', 'mmi')
hist_titles = ("Magnitude", "NST", "MMI")

fig_hist, hist_axes = plt.subplots(1, 3, figsize=(18, 6), dpi=100)

# Pair each column with its panel and title, then draw.
for column, panel_title, panel in zip(hist_columns, hist_titles, hist_axes):
    sns.histplot(data=data_sampled, x=column, ax=panel)
    panel.set_title(panel_title)

# Adjust spacing so neighbouring panels do not overlap.
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side histograms of sig and depth, drawn the same way as the
# magnitude / nst / mmi figure: one loop, then tight_layout + show so the
# panels do not overlap and no stray Text(...) repr is printed.
fig, axes = plt.subplots(1, 2, figsize=(18, 6), dpi=100)

for column, panel_title, panel in zip(('sig', 'depth'), ("SIG", "Depth"), axes):
    sns.histplot(data=data_sampled, x=column, ax=panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of magnitude for each value of the tsunami column.
tsunami_ax = sns.boxplot(x='tsunami', y='magnitude', data=data_sampled)
# Slant the category labels; the trailing semicolon hides the return value.
plt.xticks(rotation=45, ha="right");

In [None]:
# Box plot of depth for each value of the alert column.
alert_ax = sns.boxplot(x='alert', y='depth', data=data_sampled)
# Slant the category labels; the trailing semicolon hides the return value.
plt.xticks(rotation=45, ha="right");

In [None]:
# Pairwise correlation of the numeric measurements, drawn as an annotated heatmap.
numeric_features = ['magnitude', 'nst', 'mmi', 'sig', 'depth']
correlations = data_sampled[numeric_features].corr()

plt.figure(figsize=(15, 10))
sns.heatmap(correlations, annot=True, linecolor='black', cmap='magma')
plt.show()

In [None]:
# Predictor (sig) and response (magnitude) as 2-D column arrays of shape
# (n_rows, 1). Selecting with a one-element list keeps the column dimension,
# so no reshape (and no numpy call) is needed.
X = data_sampled[['sig']].to_numpy()
Y = data_sampled[['magnitude']].to_numpy()

# Raw relationship between the two variables before fitting anything.
plt.scatter(X, Y)
plt.grid()
plt.xlabel("sig")
plt.ylabel("magnitude");  # semicolon hides the Text(...) repr

In [None]:
# Fit a linear regression of Y on X; fit() returns the estimator itself.
linreg = LinearRegression().fit(X, Y)

# coef_ is indexed twice and intercept_ once to pull out the scalar
# slope (a) and intercept (b) of the fitted line.
slope = linreg.coef_[0][0]
intercept = linreg.intercept_[0]
print('a=', slope)
print('b=', intercept)

# Predictions for the same X the model was fitted on.
Y_hat = linreg.predict(X)

In [None]:
# Signed error of each prediction (predicted minus observed).
erro = Y_hat - Y

# Flatten each array into a 1-D column and assemble the comparison table
# directly with pandas. Every column keeps its own dtype, and the cell does
# not need numpy to be in scope.
df_estimado = pd.DataFrame({
    'X': X.ravel(),
    'Y': Y.ravel(),
    'Y_hat': Y_hat.ravel(),
    'Erro (e)': erro.ravel(),
})
df_estimado


In [None]:
# Observed points ('Valor Real') against the model's predictions
# ('Valor Predito') at the same X values.
plt.scatter(X, Y, label='Valor Real')
plt.scatter(X, Y_hat, color='red', label='Valor Predito')
plt.grid()
plt.legend(loc='upper right')

# X holds the sig column and Y the magnitude column, so label the axes
# accordingly (the previous labels described a different dataset).
plt.xlabel("sig")
plt.ylabel("magnitude");  # semicolon hides the Text(...) repr

In [None]:
# Observed points with the fitted regression line drawn over them in red.
plt.scatter(X, Y)
plt.plot(X, Y_hat, color='red')
plt.grid()

# X holds the sig column and Y the magnitude column, so label the axes
# accordingly (the previous labels described a different dataset).
plt.xlabel("sig")
plt.ylabel("magnitude");  # semicolon hides the Text(...) repr

In [None]:
# Fit quality on the data the model was trained on: mean squared error and
# coefficient of determination.
mse = mean_squared_error(Y, Y_hat)
r2 = r2_score(Y, Y_hat)
print("MSE=", mse, ", R^2=", r2)