In [None]:
# Necessary cells needed for analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,f1_score
from sklearn.preprocessing import LabelEncoder

Data Validation

This data has 1500 rows and 8 columns. I have validated all variables and made changes wherever necessary.
All the columns are:
owned : numeric - 1 or not (0) - two possible outcomes,
make_model : character - 6 possible values,
review_month : character from Jan to Dec,
web_browser : character - 7 categories,
reviewer_age : numeric - 16 and above,
primary_use : Character - two categories,
value_for_money : rating from 1 to 10,
overall_rating : continuous values from  0 to 25

In [None]:

# Location of the electric-bike review survey data
DATA_URL = "https://s3.amazonaws.com/talent-assets.datacamp.com/electric_bike_ratings_2212.csv"

# Load the CSV into the working DataFrame
df = pd.read_csv(DATA_URL)

# Preview the first rows
df.head()

In [None]:
# Print column dtypes and non-null counts to compare against the data dictionary
df.info()

In [None]:
# Number of missing (NaN) values in each column
df.isnull().sum()

In [None]:
# List the distinct values of `owned`; the data dictionary expects only 0 and 1
df['owned'].unique()

In [None]:
# Count the distinct `make_model` values; the data dictionary expects 6
df['make_model'].nunique()

In [None]:
# List the distinct `review_month` values; the data dictionary expects Jan to Dec
df['review_month'].unique()

In [None]:
# List the distinct `web_browser` values; the data dictionary expects 7 categories
df['web_browser'].unique()

In [None]:
# List the distinct `reviewer_age` values to spot ages below 16 or
# non-numeric placeholders
df['reviewer_age'].unique()

In [None]:
# Clean `reviewer_age`: the raw column holds ages as text and uses '-' as the
# placeholder for an unknown age.

# Parse to numbers; '-' (and any other non-numeric entry) becomes NaN instead
# of raising during the integer conversion
ages = pd.to_numeric(df['reviewer_age'], errors='coerce')

# Average age over the rows that carry a real value
avg_age = ages.mean()

# Fill the gaps with the average and store the column as int
# (astype(int) drops the fractional part of the average)
df['reviewer_age'] = ages.fillna(avg_age).astype(int)

In [None]:
# List the distinct `primary_use` values; the data dictionary expects two categories
df['primary_use'].unique()

In [None]:
# List the distinct `value_for_money` values; the data dictionary expects ratings from 1 to 10
df['value_for_money'].unique()

In [None]:
# List the distinct `overall_rating` values; the data dictionary expects scores from 0 to 25
df['overall_rating'].unique()


In [None]:
# Replace missing `web_browser` values with the explicit category "unknown".
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df['web_browser']` acts on a temporary Series and, under pandas
# Copy-on-Write, leaves `df` itself unchanged.
df['web_browser'] = df['web_browser'].fillna("unknown")

In [None]:
# Re-count missing values per column after filling `web_browser`
df.isnull().sum()


In [None]:
# Summary statistics for the DataFrame; inspect the `min` row for unexpected negative values
df.describe()

Exploratory analysis

Target Variable - owned

The product team wants to extend the survey, but first they want to be sure they can predict whether a rating came from an owner or a non-owner.
We use a bar plot to show the number of reviews from owners and non-owners:
a -> From the visualization, it is clear that the category of the variable "owned" with value 1 (owners) has the most observations.
b -> The observations are not balanced across the categories of the variable "owned": there are more reviews from owners than from non-owners.

In [None]:
# Bar chart of the number of reviews per `owned` class
fig, ax = plt.subplots()
sns.countplot(data=df, x='owned', ax=ax)
plt.show()

Looking at the histogram below, we can see that most of the higher ratings fall between 18 and 20.

In [None]:
# Histogram showing how `overall_rating` is distributed
fig, ax = plt.subplots()
df['overall_rating'].hist(ax=ax)
ax.set_xlabel('Overall Rating')
ax.set_ylabel('Count')
ax.set_title('Distribution of Overall Rating')
plt.show()


As the plots below show, owners give higher overall ratings than non-owners.
The same pattern holds across web browsers and primary uses, and most of the reviewers are owners.

In [None]:
# Box plot comparing the overall-rating distribution of owners and non-owners
fig, ax = plt.subplots()
sns.boxplot(data=df, x='owned', y='overall_rating', ax=ax)
plt.show()


In [None]:
# Overall rating by ownership, split by web browser (left panel)
# and by primary use (right panel)
fig, axes = plt.subplots(1, 2, figsize=(25, 6))
browser_ax, use_ax = axes

sns.boxplot(data=df, x='owned', y='overall_rating', hue='web_browser', ax=browser_ax)
browser_ax.set_title('overall rating in web_browser')

sns.boxplot(data=df, x='owned', y='overall_rating', hue='primary_use', ax=use_ax)
use_ax.set_title('over all rating according primary use ')

plt.show()

Model Fitting

The business wants to predict whether a review came from an owner or not using the data provided. This is a classification task,
so I will use Logistic Regression: it is easy to train, fast to predict, and gives an estimate of the probability of the target variable for each input.
For the comparison model I will use a Random Forest Classifier. It can handle missing data and categorical variables, and it is robust to outliers and noisy data. It also gives an estimate of the importance of each feature, which is useful for understanding the dataset.


Prepare Data for Modelling

To enable modelling, we chose make_model, review_month, web_browser, reviewer_age, primary_use and value_for_money as features, and owned as the target variable.
I have also made the following changes:

- Convert the categorical variables into numeric features
- Split the data into a training set and a test set

In [None]:
# Features and target used for modelling
feature_cols = ['make_model', 'review_month', 'web_browser', 'reviewer_age',
                'primary_use', 'value_for_money']
X = df[feature_cols].copy()
y = df['owned']

# scikit-learn estimators need numeric inputs, so every non-numeric feature is
# label-encoded here, *before* the split. `X` is a copy of `df`, so encoding
# `df` after this cell never reaches X_train / X_test.
# NOTE(review): label encoding imposes an arbitrary (alphabetical) order on
# the categories; consider one-hot encoding for the linear model.
for col in X.select_dtypes(exclude='number').columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Split dataset into 80% training set and 20% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Convert the listed columns of `df` to integer codes.
# A single LabelEncoder is re-fitted for each column in turn, so after the
# loop `encoder` holds the mapping of the last column only.
# NOTE: this rewrites `df`; `X`, `X_train` and `X_test` were derived from `df`
# earlier and are not changed by it.
encoder = LabelEncoder()
for column in ('make_model', 'review_month', 'web_browser', 'primary_use', 'value_for_money'):
    df[column] = encoder.fit_transform(df[column])


Logistic Regression

In [None]:
# Baseline model: L2-regularised logistic regression with C=0.1
log_reg = LogisticRegression(penalty='l2', C=0.1).fit(X_train, y_train)

# Predict ownership for the held-out reviews
y_pred_log_reg = log_reg.predict(X_test)

# Score the predictions with F1 and precision
f1 = f1_score(y_test, y_pred_log_reg)
precision = precision_score(y_test, y_pred_log_reg)
print("F1 Score:", f1)
print("Precision Score:", precision)

Finding the feature importance

In [None]:
# Read the coefficients of the logistic regression that was fitted and scored
# above, so the reported values describe the evaluated model rather than a
# second model refitted with different hyperparameters.
coefs = log_reg.coef_[0]

# Feature names, in the column order of the feature matrix
feature_names = X.columns

# One row per feature, ranked by |coef| so that strong negative effects are
# not pushed to the bottom of the table; the signed value is kept.
# NOTE(review): coefficient sizes are only comparable across features that
# share a scale; the features are not standardised here.
feature_importances = (
    pd.DataFrame({'feature': feature_names, 'coef': coefs})
    .sort_values(by='coef', key=abs, ascending=False)
)

# Print the feature importances
print(feature_importances)


In [None]:
# Bar chart of the logistic-regression coefficients, one bar per feature
fig, ax = plt.subplots()
ax.bar(feature_importances['feature'], feature_importances['coef'])
ax.set_xlabel('Feature')
ax.set_ylabel('Coefficient')
ax.set_title('Feature Importances')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Finding the best parameters

In [None]:
# Hyperparameter grid: regularisation strength and penalty type
param_grid = {'C': [0.1, 1, 10],
              'penalty': ['l1', 'l2']}

# The default 'lbfgs' solver does not support the l1 penalty, so every l1
# candidate would fail to fit and be scored as NaN. 'liblinear' supports both
# l1 and l2, so all six combinations are actually evaluated.
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5, n_jobs=-1)

# Run the 5-fold cross-validated search on the training set only
grid_search.fit(X_train, y_train)

# Print the best parameters and the best mean cross-validated score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


In [None]:
# Visualise the grid-search scores as a heatmap
c_values = param_grid['C']
penalties = param_grid['penalty']

# mean_test_score is a flat array; reshape it to
# (number of C values, number of penalties)
results = grid_search.cv_results_
scores = results['mean_test_score'].reshape(len(c_values), len(penalties))

fig, ax = plt.subplots()
heatmap = ax.imshow(scores, cmap='gray', interpolation='nearest')
fig.colorbar(heatmap, ax=ax)
ax.set_xlabel('Penalty')
ax.set_ylabel('C')
ax.set_xticks(np.arange(len(penalties)))
ax.set_xticklabels(penalties)
ax.set_yticks(np.arange(len(c_values)))
ax.set_yticklabels(c_values)
ax.set_title('Accuracy of Logistic Regression')
plt.show()

Random Forest Classifier

In [None]:

# Comparison model: random forest with 200 trees of depth <= 5.
# random_state is fixed (same seed as the train/test split) so the scores
# below are reproducible from run to run.
rnd_clf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)
rnd_clf.fit(X_train, y_train)

# Predict ownership for the held-out reviews
y_pred_rnd_clf = rnd_clf.predict(X_test)

# Calculate the f1_score
f1 = f1_score(y_test, y_pred_rnd_clf)
print("F1 Score:", f1)
# Calculate the precision_score
precision = precision_score(y_test, y_pred_rnd_clf)
print("Precision Score:", precision)

Finding the feature importance

In [None]:
# Use the random forest that was fitted and scored above, so the importances
# describe the model that was actually evaluated.
importances = rnd_clf.feature_importances_

# Feature names, in the column order of the feature matrix
feature_names = X.columns

# One row per feature, most important first
feature_importances = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Print the feature importances
print(feature_importances)

In [None]:
# Bar chart of the random-forest importances, one bar per feature
fig, ax = plt.subplots()
ax.bar(feature_importances['feature'], feature_importances['importance'])
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Feature Importances')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Finding the best parameter

In [None]:
# Hyperparameter grid: number of trees and maximum tree depth
param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [5, 10, 15] }

# Search over a seeded forest so every candidate is trained with the same
# random_state and the best parameters / score are reproducible.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)

# Run the 5-fold cross-validated search on the training set only
grid_search.fit(X_train, y_train)

# Print the best parameters and the best mean cross-validated score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


In [None]:
# Visualise the mean cross-validated score of every parameter combination,
# with the standard deviation across folds as error bars
results = grid_search.cv_results_
params = results['params']
mean_test_scores = results['mean_test_score']
std_test_scores = results['std_test_score']

fig, ax = plt.subplots()
ax.errorbar(range(len(params)), mean_test_scores, yerr=std_test_scores)
ax.set_xlabel('Parameter Combination')
ax.set_ylabel('Mean Test Score')
plt.show()


The plot above shows the mean test score for each parameter combination tested in the grid search, along with the standard deviation of the test scores. The x-axis shows the index of the parameter combination, and the y-axis shows the mean test score.

Why did I choose these evaluation metrics?

The precision_score metric focuses on the model's ability to correctly predict the positive class, specifically on minimizing the number of false positives.
It ranges between 0 and 1, where 1 represents a perfect score and 0 represents a poor score. Precision measures how many of the positive predictions were actually correct.

The f1_score metric balances precision and recall to give a single number that
represents the overall performance of the model.
It ranges between 0 and 1, where 1 represents a perfect score and 0 represents a poor score.
The F1 score is a better measure than accuracy, especially when the class
distribution is uneven.

The f1_score of the Logistic Regression model and the Random Forest Classification model is 0.76 and 0.80 respectively, meaning that the Random Forest Classification model predicts more correctly than Logistic Regression.
The precision_score of Logistic Regression and Random Forest Classification is 0.71 and 0.75 respectively, meaning that Random Forest Classification is the better-performing model.

From these metrics, we can conclude that the Random Forest Classification model has a higher chance of performing better.