In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Project Goals
### The purpose of this analysis is: 

- Create a linear regression model that predicts the outcome for a tennis player based on their playing habits. 
- Determine which features help to predict the target value of winnings in USD.




First, let's explore the dataset:

In [None]:
# Load the raw tennis statistics from the CSV file next to the notebook.
df = pd.read_csv('tennis_stats.csv')
display(df.head())   # display() renders the preview as an HTML table instead of plain text
# info() prints its summary (non-null counts and dtypes) itself and returns None,
# so it is called bare: wrapping it in display() would append a stray "None" to the output.
df.info()

There is no missing data.
Let's proceed with EDA:

In [None]:
# Pairwise correlation matrix of the numeric (and boolean) columns only.
# Recent pandas releases raise on non-numeric columns instead of silently
# dropping them, so the frame is restricted explicitly before calling corr().
corr = df.select_dtypes(include = ['number', 'bool']).corr()

# Correlation of every feature with the target 'Winnings':
#   - rounded to 3 decimals,
#   - the target's correlation with itself removed,
#   - feature names moved into an 'index' column (used for plotting later),
#   - strongest positive correlation first.
winnings = (
    corr['Winnings']
    .round(3)
    .drop('Winnings')
    .reset_index()
    .sort_values(by = "Winnings", ascending = False)
)
winnings

In [None]:
# Bar chart of each feature's correlation with Winnings (already sorted descending).
plt.bar(winnings['index'], winnings.Winnings)
plt.xticks(rotation = 90)   # feature names are long; rotate them so they do not overlap
# Title and axis labels so the figure can be read without the surrounding cells.
plt.title('Correlation of each feature with Winnings')
plt.xlabel('Feature')
plt.ylabel('Correlation with Winnings')
plt.show()

We will plot the features that have a strong correlation with Winnings (≥ 0.80):
- Wins: 0.91
- ServiceGamesPlayed: 0.91
- ReturnGamesPlayed: 0.91
- BreakPointsOpportunities: 0.90
- BreakPointsFaced: 0.88
- Losses: 0.87
- DoubleFaults: 0.85
- Aces: 0.80

In [None]:
# Names of the eight features selected for closer inspection.
top_features = [
    'Wins',
    'ServiceGamesPlayed',
    'ReturnGamesPlayed',
    'BreakPointsOpportunities',
    'BreakPointsFaced',
    'Losses',
    'DoubleFaults',
    'Aces',
]

# Feature columns and the target column, each kept as a DataFrame.
df_top = df.loc[:, top_features]
df_target = df.loc[:, ['Winnings']]

Scatter plots of Winnings against each feature with a correlation ≥ 0.80:

In [None]:
# Exploratory analysis: one small scatter plot per selected feature,
# with the feature on the x-axis and Winnings on the y-axis.
for feature in df_top:
    plt.figure(figsize = (5, 4))
    plt.scatter(df_top[feature], df_target, alpha = 0.5)
    plt.title(f"{feature} vs Winnings")
    plt.show()

# Regression model
Let's use one feature from the dataset to build a single-feature linear regression model. At this point the model should use only one feature to predict the Winnings ($USD) column. Before training the model, we will split the data into training and test sets so that we can evaluate the model on the test set.

In [None]:
# Single-feature model: predict Winnings from ServiceGamesPlayed.
feature = 'ServiceGamesPlayed'
X = np.array(df[feature]).reshape(-1, 1)   # one column, one row per observation
y = df_target.copy()

# Hold out 80% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): only 20% of the rows are used for training -- confirm this split is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.8, random_state = 27
)

# Create the model and fit it on the training rows, then predict the held-out rows.
reg_all = LinearRegression().fit(X_train, y_train)
y_pred = reg_all.predict(X_test)

### How does our model perform? 

In [None]:
# R^2 of the fitted model on the held-out test rows.
test_r2 = reg_all.score(X_test, y_test)
test_r2

Plotting our model’s predictions on the test set against the actual outcome variable to visualize the performance.

In [None]:
# Plot the model's predictions against the actual winnings of the test rows.
# Actual values go on the x-axis and predictions on the y-axis, and the labels
# now say so: the previous labels described the x-axis (which held predictions)
# as "Actual Winnings" and the y-axis (which held actual winnings) as the feature.
plt.scatter(x = y_test, y = y_pred, alpha = 0.5)
plt.title(f'Predicted Winnings vs. Actual Winnings - {feature}')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()

### Plotting the fitted regression line against the test data

In [None]:
# Test observations as points, with the model's predictions drawn as a red line.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, alpha = 0.5)
ax.plot(X_test, y_pred, color = 'red')
ax.set_xlabel(feature)
ax.set_ylabel('Actual Winnings ($USD)')
plt.show()

Let's create a few more linear regression models that each use a single feature to predict winnings. Which of these models is the best?

#### Feature - 'Wins'

In [None]:
# Single-feature model: predict Winnings from Wins.
# train_test_split and LinearRegression are already imported in the notebook's
# first cell, so they are not imported again here.
feature = 'Wins'
X = np.array(df[feature])
X = X.reshape(-1, 1)   # one column, one row per observation
y = df_target.copy()

# Same split as the other models (80% held out for testing, seed 27) so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8,
                                                    random_state = 27)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)       # fit on the training rows
y_pred = reg_all.predict(X_test)    # predictions for the held-out rows
reg_all.score(X_test, y_test)       # R^2 on the held-out rows, shown as the cell output

In [None]:
# Held-out observations (points) and the fitted regression line (red).
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, alpha = 0.5)
ax.plot(X_test, y_pred, color = 'red')
ax.set_xlabel(feature)
ax.set_ylabel('Actual Winnings ($USD)')
plt.show()

#### Feature - 'ReturnGamesPlayed'

In [None]:
# Single-feature model: predict Winnings from ReturnGamesPlayed.
# train_test_split and LinearRegression are already imported in the notebook's
# first cell, so they are not imported again here.
feature = 'ReturnGamesPlayed'
X = np.array(df[feature])
X = X.reshape(-1, 1)   # one column, one row per observation
y = df_target.copy()

# Same split as the other models (80% held out for testing, seed 27) so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8,
                                                    random_state = 27)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)       # fit on the training rows
y_pred = reg_all.predict(X_test)    # predictions for the held-out rows
reg_all.score(X_test, y_test)       # R^2 on the held-out rows, shown as the cell output

In [None]:
# Held-out observations (points) and the fitted regression line (red).
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, alpha = 0.5)
ax.plot(X_test, y_pred, color = 'red')
ax.set_xlabel(feature)
ax.set_ylabel('Actual Winnings ($USD)')
plt.show()

#### Feature - 'BreakPointsOpportunities'

In [None]:
# Single-feature model: predict Winnings from BreakPointsOpportunities.
# train_test_split and LinearRegression are already imported in the notebook's
# first cell, so they are not imported again here.
feature = 'BreakPointsOpportunities'
X = np.array(df[feature])
X = X.reshape(-1, 1)   # one column, one row per observation
y = df_target.copy()

# Same split as the other models (80% held out for testing, seed 27) so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8,
                                                    random_state = 27)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)       # fit on the training rows
y_pred = reg_all.predict(X_test)    # predictions for the held-out rows
reg_all.score(X_test, y_test)       # R^2 on the held-out rows, shown as the cell output

In [None]:
# Held-out observations (points) and the fitted regression line (red).
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, alpha = 0.5)
ax.plot(X_test, y_pred, color = 'red')
ax.set_xlabel(feature)
ax.set_ylabel('Actual Winnings ($USD)')
plt.show()

Now, let's create a few linear regression models that use two features to predict yearly earnings. Which set of two features results in the best model?

In [None]:
# Two-feature model: predict Winnings from ServiceGamesPlayed and ReturnGamesPlayed.
# Selecting a list of columns already yields a 2-D array, so no reshape is needed.
# train_test_split and LinearRegression are already imported in the notebook's
# first cell, so they are not imported again here.
feature = ['ServiceGamesPlayed', 'ReturnGamesPlayed']
X = np.array(df[feature])
y = df_target.copy()

# Same split as the other models (80% held out for testing, seed 27) so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8,
                                                    random_state = 27)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)       # fit on the training rows
y_pred = reg_all.predict(X_test)    # predictions for the held-out rows
reg_all.score(X_test, y_test)       # R^2 on the held-out rows, shown as the cell output

In [None]:
# Two-feature model: predict Winnings from BreakPointsOpportunities and ReturnGamesPlayed.
# Selecting a list of columns already yields a 2-D array, so no reshape is needed.
# train_test_split and LinearRegression are already imported in the notebook's
# first cell, so they are not imported again here.
feature = ['BreakPointsOpportunities', 'ReturnGamesPlayed']
X = np.array(df[feature])
y = df_target.copy()

# Same split as the other models (80% held out for testing, seed 27) so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8,
                                                    random_state = 27)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)       # fit on the training rows
y_pred = reg_all.predict(X_test)    # predictions for the held-out rows
reg_all.score(X_test, y_test)       # R^2 on the held-out rows, shown as the cell output

In [None]:
# Two-feature model: predict Winnings from ServiceGamesPlayed and BreakPointsOpportunities.
# Selecting a list of columns already yields a 2-D array, so no reshape is needed.
# train_test_split and LinearRegression are already imported in the notebook's
# first cell, so they are not imported again here.
feature = ['ServiceGamesPlayed', 'BreakPointsOpportunities']
X = np.array(df[feature])
y = df_target.copy()

# Same split as the other models (80% held out for testing, seed 27) so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8,
                                                    random_state = 27)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)       # fit on the training rows
y_pred = reg_all.predict(X_test)    # predictions for the held-out rows
reg_all.score(X_test, y_test)       # R^2 on the held-out rows, shown as the cell output

### Finally, we will create a multiple feature linear regression using our eight top features:

In [None]:
# Multiple-feature model: predict Winnings from all eight selected features.
# The list is taken from `top_features` (defined with df_top / df_target earlier in
# the notebook) instead of being retyped, so the two cannot drift apart.
# list(...) makes a copy so later changes to `feature` cannot alter `top_features`.
feature = list(top_features)
X = np.array(df[feature])   # already 2-D: one column per feature
y = df_target.copy()

# Same split as the other models (80% held out for testing, seed 27) so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8,
                                                    random_state = 27)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)       # fit on the training rows
y_pred = reg_all.predict(X_test)    # predictions for the held-out rows
reg_all.score(X_test, y_test)       # R^2 on the held-out rows, shown as the cell output

In [None]:
# Predicted vs. actual winnings for the multiple-feature model on the test rows.
plt.scatter(y_test, y_pred, alpha = 0.5, color = 'green')
plt.title('Predicted Winnings vs. Actual Winnings - Multiple Features')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
# No plt.clf() after show(): it would create and clear a new, empty figure,
# which appears as a stray empty-figure output below the plot.
plt.show()