In [3]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Read in data from the NASA Exoplanet Archive.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until the location is made configurable.
EXOPLANET_CSV_PATH = r"C:\Users\julia\Downloads\csv_main.csv"
# Number of leading lines of the file that are skipped before parsing
# (presumably the archive's descriptive header block -- confirm against the file).
EXOPLANET_CSV_SKIPROWS = 78
df = pd.read_csv(EXOPLANET_CSV_PATH, skiprows=EXOPLANET_CSV_SKIPROWS)

In [6]:
# Linear Regression Analysis
def linear_regression_analysis(df, predictor_var, response_var):
    """
    Performs linear regression analysis between two variables in a DataFrame.

    Rows in which either column is NaN are dropped, the remaining rows are
    split 80/20 into training and test sets (random_state=0), and a
    LinearRegression model is fitted on the training set. The test-set mean
    squared error and R-squared are printed, and a scatter plot of the actual
    test points (blue) with the predicted line (red) is shown.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    predictor_var: The name of the column to be used as the predictor variable.
    response_var: The name of the column to be used as the response variable.

    Returns:
    None. Results are printed and plotted.
    """
    # Keep only rows where both columns of interest are present
    complete_rows = df.dropna(subset=[predictor_var, response_var])
    features = complete_rows[[predictor_var]]  # 2-D selection, as the estimator expects
    target = complete_rows[response_var]

    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=0
    )

    # Fit on the training split, evaluate on the held-out split
    regressor = LinearRegression()
    regressor.fit(features_train, target_train)
    predictions = regressor.predict(features_test)
    metrics = (
        ("Mean Squared Error:", mean_squared_error(target_test, predictions)),
        ("R-squared:", r2_score(target_test, predictions)),
    )

    # Print results
    print(f"Linear Regression Analysis for {predictor_var} vs {response_var}")
    for metric_label, metric_value in metrics:
        print(metric_label, metric_value)

    # Plot results
    plt.scatter(features_test, target_test, color='blue', label='Actual')
    plt.plot(features_test, predictions, color='red', label='Predicted')
    plt.title(f'Linear Regression: {predictor_var} vs {response_var}')
    plt.xlabel(predictor_var)
    plt.ylabel(response_var)
    plt.legend()
    plt.show()
    
# Polynomial Regression Analysis
def polynomial_regression_analysis(df, predictor_var, response_var, degree, x_label=None, y_label=None):
    """
    Performs polynomial regression analysis between two variables in a DataFrame.

    Rows in which either column is NaN are dropped, the remaining rows are
    split 80/20 into training and test sets (random_state=0), the predictor is
    expanded with PolynomialFeatures(degree), and a LinearRegression model is
    fitted on the expanded training set. The test-set mean squared error and
    R-squared are printed, and a scatter plot of the actual test points (blue)
    with the predicted curve (red) is shown.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    predictor_var: The name of the column to be used as the predictor variable.
    response_var: The name of the column to be used as the response variable.
    degree: The degree of the polynomial regression.
    x_label: Custom label for the x-axis. Defaults to predictor_var when None.
    y_label: Custom label for the y-axis. Defaults to response_var when None.

    Returns:
    None. Results are printed and plotted.
    """
    # Fall back to the column names so the labels are optional
    if x_label is None:
        x_label = predictor_var
    if y_label is None:
        y_label = response_var

    # Filtering out rows with NaN values
    df_filtered = df.dropna(subset=[predictor_var, response_var])
    X = df_filtered[[predictor_var]]
    y = df_filtered[response_var]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Learn the polynomial expansion on the training split only, then reuse
    # the same fitted transformer on the test split
    poly = PolynomialFeatures(degree)
    X_train_poly = poly.fit_transform(X_train)
    model_poly = LinearRegression()
    model_poly.fit(X_train_poly, y_train)
    X_test_poly = poly.transform(X_test)
    y_pred_poly = np.asarray(model_poly.predict(X_test_poly))
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # Print results
    print(f"Polynomial Regression (Degree = {degree}):")
    print("Mean Squared Error:", mse_poly)
    print("R-squared:", r2_poly)

    # Order the test points by predictor value so the fitted curve is drawn
    # left to right instead of zig-zagging between unsorted points. The order
    # is computed on plain NumPy arrays so that the positional indices apply
    # uniformly to the predictor, the actual values and the predictions.
    x_test_values = X_test.iloc[:, 0].to_numpy()
    y_test_values = y_test.to_numpy()
    sorted_indices = np.argsort(x_test_values)
    X_test_sorted = x_test_values[sorted_indices]
    y_test_sorted = y_test_values[sorted_indices]
    y_pred_sorted = y_pred_poly[sorted_indices]

    # Plot results
    plt.scatter(X_test_sorted, y_test_sorted, color='blue', label='Actual')
    plt.plot(X_test_sorted, y_pred_sorted, color='red', label='Predicted')
    plt.title(f'Polynomial Regression (Degree = {degree}): {x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.show()