In [87]:
##-----Part B-----## 
##Predictive modelling##

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression as SkLR






In [89]:
print("scikit-learn version:", sklearn.__version__)

print("Using:", SkLR.__module__)

scikit-learn version: 1.7.2
Using: sklearn.linear_model._base


In [90]:
df=pd.read_csv("C:/Users/aravi/Downloads/zomato_df_final_data.csv")
print(f"Dataset dimensions: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())


Dataset dimensions: (10500, 17)

First 5 rows:


Unnamed: 0,address,cost,cuisine,lat,link,lng,phone,rating_number,rating_text,subzone,title,type,votes,groupon,color,cost_2,cuisine_color
0,"371A Pitt Street, CBD, Sydney",50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,https://www.zomato.com/sydney/sydney-madang-cbd,151.207605,02 8318 0406,4.0,Very Good,CBD,Sydney Madang,['Casual Dining'],1311.0,False,#e15307,5.243902,#6f706b
1,"Shop 7A, 2 Huntley Street, Alexandria, Sydney",80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,https://www.zomato.com/sydney/the-grounds-of-a...,151.193793,02 9699 2225,4.6,Excellent,"The Grounds of Alexandria, Alexandria",The Grounds of Alexandria Cafe,['Café'],3236.0,False,#9c3203,7.560976,#6f706b
2,"Level G, The Darling at the Star, 80 Pyrmont ...",120.0,['Japanese'],-33.867971,https://www.zomato.com/sydney/sokyo-pyrmont,151.19521,1800 700 700,4.9,Excellent,"The Star, Pyrmont",Sokyo,['Fine Dining'],1227.0,False,#7f2704,10.650407,#6f706b
3,"Sydney Opera House, Bennelong Point, Circular...",270.0,['Modern Australian'],-33.856784,https://www.zomato.com/sydney/bennelong-restau...,151.215297,02 9240 8000,4.9,Excellent,Circular Quay,Bennelong Restaurant,"['Fine Dining', 'Bar']",278.0,False,#7f2704,22.235772,#4186f4
4,"20 Campbell Street, Chinatown, Sydney",55.0,"['Thai', 'Salad']",-33.879035,https://www.zomato.com/sydney/chat-thai-chinatown,151.206409,02 8317 4811,4.5,Excellent,Chinatown,Chat Thai,['Casual Dining'],2150.0,False,#a83703,5.630081,#6f706b


In [91]:
print(df.columns.tolist())

['address', 'cost', 'cuisine', 'lat', 'link', 'lng', 'phone', 'rating_number', 'rating_text', 'subzone', 'title', 'type', 'votes', 'groupon', 'color', 'cost_2', 'cuisine_color']


In [92]:
print("\nData types:")
print(df.dtypes)


Data types:
address           object
cost             float64
cuisine           object
lat              float64
link              object
lng              float64
phone             object
rating_number    float64
rating_text       object
subzone           object
title             object
type              object
votes            float64
groupon             bool
color             object
cost_2           float64
cuisine_color     object
dtype: object


In [93]:
#Question 1

In [94]:
##1.Handling missing values
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
address             0
cost              346
cuisine             0
lat               192
link                0
lng               192
phone               0
rating_number    3316
rating_text      3316
subzone             0
title               0
type               48
votes            3316
groupon             0
color               0
cost_2            346
cuisine_color       0
dtype: int64


In [95]:
#Now we are going to handle the missing the data here

In [96]:
#we impute numeric columns with -1 and categorical values with 'unknown'
for col in df.columns:
     # categorical values
    if df[col].dtype == "object":  
        df[col] = df[col].fillna("Unknown")
    else:            
        # numeric values               
        df[col] = df[col].fillna(-1)

In [97]:
#Now we are dropping values by detecting outliers using the IQR method
Q1 = df['cost'].quantile(0.25)
Q3 = df['cost'].quantile(0.75)
IQR = Q3 - Q1

outliers = df[(df['cost'] < (Q1 - 1.5 * IQR)) | (df['cost'] > (Q3 + 1.5 * IQR))]
print(f"Number of outliers in cost: {len(outliers)}")

Number of outliers in cost: 361


In [98]:
#since we have these many outliers we need to drop them
df = df[~((df['cost'] < (Q1 - 1.5 * IQR)) | (df['cost'] > (Q3 + 1.5 * IQR)))]
print("Dataset after removing outliers:", df.shape)

Dataset after removing outliers: (10139, 17)


In [99]:
##Encoding categorical features##

In [100]:
#label encoding

In [101]:
le = LabelEncoder()

In [102]:
df['rating_text_encoded'] = le.fit_transform(df['rating_text'])
print(dict(zip(le.classes_, le.transform(le.classes_))))

{'Average': np.int64(0), 'Excellent': np.int64(1), 'Good': np.int64(2), 'Poor': np.int64(3), 'Unknown': np.int64(4), 'Very Good': np.int64(5)}


In [103]:
##creating useful features##
##some of the useful features includes, cuisine diversity,cost,votes,ratings in number and texts, subzone etc

In [104]:
#analysing the number of cuisines, the restaurant serves
df['cuisine_count'] = df['cuisine'].apply(lambda x: len(str(x).split(',')))


In [105]:
print(df['cuisine_count'])

0        4
1        4
4        2
7        1
9        2
        ..
10495    1
10496    1
10497    1
10498    1
10499    1
Name: cuisine_count, Length: 10139, dtype: int64


In [106]:
##finding the number of people voted on the basis of voting
df['rating_strength'] = df['rating_number'] * np.log1p(df['votes'])


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [107]:
print(df['rating_strength'])

0        28.717232
1        37.179050
4        34.531597
7        33.368528
9        33.831289
           ...    
10495          inf
10496          inf
10497          inf
10498          inf
10499          inf
Name: rating_strength, Length: 10139, dtype: float64


In [108]:
#finding out the restaurants which are popular and not. 
df['is_popular'] = (df['votes'] > df['votes'].median()).astype(int)


In [109]:
print(df['is_popular'])
#if it shows 1 it is popular and if it show 0 its not popular

0        1
1        1
4        1
7        1
9        1
        ..
10495    0
10496    0
10497    0
10498    0
10499    0
Name: is_popular, Length: 10139, dtype: int64


In [110]:
##finding the cost bins
bins = [0, 50, 100, 200, 400, 800, 1600, float('inf')]
labels = ['Very Low', 'Low', 'Mid-Low', 'Mid', 'Mid-High', 'High', 'Luxury']
df['cost_bin'] = pd.cut(df['cost'], bins=bins, labels=labels, include_lowest=True)


In [111]:
df['cost_bin']

0        Very Low
1             Low
4             Low
7             Low
9             Low
           ...   
10495    Very Low
10496    Very Low
10497    Very Low
10498    Very Low
10499         Low
Name: cost_bin, Length: 10139, dtype: category
Categories (7, object): ['Very Low' < 'Low' < 'Mid-Low' < 'Mid' < 'Mid-High' < 'High' < 'Luxury']

In [112]:
##----2.Regression Models------##

In [113]:
#--Model A-- Linear Regression---#

##Here we are predicting rating_number(target variable) using Scikit-learn
##For this we are using features like cuisine_count, cost, votes etc

In [114]:
# make dummy variables for rating_text (if not already created)
if 'rating_text' in df.columns:
    dummies = pd.get_dummies(df['rating_text'], prefix='rating_text')
    df = pd.concat([df, dummies], axis=1)

In [115]:
# features chosen
feature_cols = ['cost','votes','cuisine_count',
                'rating_text_Excellent','rating_text_Good',
                'rating_text_Poor','rating_text_Very Good']

In [116]:
X = df[feature_cols]
y = df['rating_number']  

In [117]:
# Target variable
y = df['rating_number']

In [118]:
# Step 2: Train-test split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [119]:
# Step 3: Train Linear Regression
# -------------------------
#model = LinearRegression()
#model.fit(X_train, y_train)

#here we are using sklr as linear regression beacause og the conflicts encountered due to pyspark
modelA = SkLR()
modelA.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [120]:
#making predictions
y_prediction_modelA = modelA.predict(X_test)

In [121]:
print(y_prediction_modelA)

[1.38779001 0.66507916 1.02377636 ... 1.15018133 1.55904788 1.12279956]


In [122]:
#Calculating Mean squared error(MSE)
mse_modelA = mean_squared_error(y_test, y_prediction_modelA)

In [123]:
print('MSE value for Linear Regression using Scikit-learn is: ',mse_modelA)

MSE value for Linear Regression using Scikit-learn is:  2.908515983930785


In [124]:

#calculating rmse for linear regression
rmse_modelA = np.sqrt(mse_modelA)
print("RMSE for Linear Regression (Scikit-Learn):", rmse_modelA)


RMSE for Linear Regression (Scikit-Learn): 1.705437182639919


In [125]:
## Model B- Gradient Descent Regression - implementing linear regression with gradient descent##

In [126]:
#Here we are using the features implemented for model A as well#

X = df[['cost', 'votes', 'cuisine_count', 
        'rating_text_Excellent', 'rating_text_Good', 
        'rating_text_Poor', 'rating_text_Very Good']].values
y = df['rating_number'].values

In [127]:
 # implementing 80/20 split for train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [128]:
##implementing feature scaling needed for gradient descent
##for this we are doing scaler for generalising features needed for gradient descent.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [129]:

X_train_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_test_b = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

In [130]:
#applying gradient descent 

def gradient_descent(X, y, lr=0.01, n_iter=1000):
    m, n = X.shape
    theta = np.zeros(n)  # initialize weights
    for _ in range(n_iter):
        gradients = (1/m) * X.T.dot(X.dot(theta) - y)
        theta -= lr * gradients
    return theta

In [131]:
#training the model
theta = gradient_descent(X_train_b, y_train, lr=0.01, n_iter=1000)

In [132]:
print(theta)

[1.902359   0.39515636 0.2581802  0.08566422 0.06082824 0.73479889
 0.09925802 0.35617087]


In [133]:
#making predictions

y_prediction_modelB = X_test_b.dot(theta)


In [134]:
#calcuating Mean Squared Value(MSE)
mse_modelB = mean_squared_error(y_test, y_prediction_modelB)

In [135]:
print('MSE value for Gradient descent regression: ',mse_modelB)

MSE value for Gradient descent regression:  2.909461706211606


In [136]:
#calculating rmse for gradient descent
rmse_modelB = np.sqrt(mse_modelB)
print("RMSE for Gradient Descent Regression:", rmse_modelB)


RMSE for Gradient Descent Regression: 1.7057144269225157


In [137]:
### 3. CLASSIFICATION MODELS ###
# here we dont have rating _text column. Here we are are going to have 2 classes.
#class 1 = Poor + Average
#class 2 = Good + Very Good + Excellent

#we set class1 as the default class here
df['rating_binary'] = 1


In [138]:
#We set those having rating as goog or very good or excellent as class 2
df.loc[(df['rating_text_Good'] == 1) | 
       (df['rating_text_Very Good'] == 1) | 
       (df['rating_text_Excellent'] == 1), 'rating_binary'] = 2

In [139]:
print(df['rating_binary'].value_counts())

rating_binary
1    7871
2    2268
Name: count, dtype: int64


In [140]:
## Applying logistic regression keeping rating_binary as target variable

X = df[['cost', 'votes', 'cuisine_count']]   
y = df['rating_binary']       

In [141]:
#we are doing train-test split for (80/20) here
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [142]:
#implementing training for logistic regression
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [143]:
#making predictions
y_prediction = logistic_regression_model.predict(X_test)

In [144]:
#evaluation with confusion matrix which shows how many restaurants were classified properly
confusion_matrix = confusion_matrix(y_test, y_prediction)


In [145]:
print("Confusion Matrix: ",confusion_matrix)

Confusion Matrix:  [[1522   52]
 [ 147  307]]


In [146]:
#Evaluation using precision which depicts the to what extent the predicition of classification was proper
precision = precision_score(y_test, y_prediction, pos_label=2)

In [147]:
print("PRECISION : ",precision)

PRECISION :  0.8551532033426184


In [148]:
#Evaluation for recall which explains how much the model correctly identified.
recall = recall_score(y_test, y_prediction, pos_label=2)

In [149]:
print("Recall: ", recall)

Recall:  0.6762114537444934


In [150]:
#Evaluation for F1 which explains how balanced, the values of precision and recall are
f1_Score = f1_score(y_test, y_prediction, pos_label=2)



In [151]:
print("F1 Score : ",f1_Score)

F1 Score :  0.7552275522755227


In [152]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


In [153]:
## TRAININIG 3 MORE MODELS##

#Initiailizing a variable to store the results of each model and show it as a table

results_models = {}

In [154]:
##Random Forest model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_random_forest = random_forest_model.predict(X_test)

In [155]:
results_models['Random Forest'] = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred_random_forest).tolist(),
    'Precision': round(precision_score(y_test, y_pred_random_forest, pos_label=2), 3),
    'Recall': round(recall_score(y_test, y_pred_random_forest, pos_label=2), 3),
    'F1': round(f1_score(y_test, y_pred_random_forest, pos_label=2), 3)
}

In [156]:
print(results_models['Random Forest'])

{'Confusion Matrix': [[1465, 109], [126, 328]], 'Precision': 0.751, 'Recall': 0.722, 'F1': 0.736}


In [157]:
###Gradient Boosted trees##
gradient_boost_model = GradientBoostingClassifier(random_state=42)
gradient_boost_model.fit(X_train, y_train)
y_pred_gradient_boost = gradient_boost_model.predict(X_test)

In [158]:
results_models['Gradient Boosted Trees'] = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred_gradient_boost).tolist(),
    'Precision': round(precision_score(y_test, y_pred_gradient_boost, pos_label=2), 3),
    'Recall': round(recall_score(y_test, y_pred_gradient_boost, pos_label=2), 3),
    'F1': round(f1_score(y_test, y_pred_gradient_boost, pos_label=2), 3)
}

In [159]:
print(results_models['Gradient Boosted Trees'])

{'Confusion Matrix': [[1490, 84], [93, 361]], 'Precision': 0.811, 'Recall': 0.795, 'F1': 0.803}


In [160]:
### SVM Model ###
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm_model= svm_model.predict(X_test)

In [161]:
results_models['SVM'] = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred_svm_model).tolist(),
    'Precision': round(precision_score(y_test, y_pred_svm_model, pos_label=2), 3),
    'Recall': round(recall_score(y_test, y_pred_svm_model, pos_label=2), 3),
    'F1': round(f1_score(y_test, y_pred_svm_model, pos_label=2), 3)
}

In [162]:
print(results_models['SVM'] )

{'Confusion Matrix': [[1494, 80], [100, 354]], 'Precision': 0.816, 'Recall': 0.78, 'F1': 0.797}


In [163]:
## converting the data we got into a table form for comparison

results_data_frame = pd.DataFrame(results_models).T
print(results_data_frame)

                                 Confusion Matrix Precision Recall     F1
Random Forest           [[1465, 109], [126, 328]]     0.751  0.722  0.736
Gradient Boosted Trees    [[1490, 84], [93, 361]]     0.811  0.795  0.803
SVM                      [[1494, 80], [100, 354]]     0.816   0.78  0.797


In [164]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.sql.functions import col, when, sum as Fsum
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator


In [165]:
from pyspark.sql import SparkSession

In [166]:

###this is being written for spark to identify the jdk path and to create the spark session.
import os, sys, subprocess, glob, time

# 1) Show what kernel you're on
print("PYTHON:", sys.executable)

# 2) Point to JDK 17 (Spark 4.x needs Java 17)
candidates = glob.glob(r"C:\Program Files\Eclipse Adoptium\jdk-17*")
if not candidates:
    raise RuntimeError("JDK 17 not found under 'C:\\Program Files\\Eclipse Adoptium'. Install Temurin 17.")
os.environ["JAVA_HOME"] = candidates[0]
os.environ["PATH"] = os.path.join(os.environ["JAVA_HOME"], "bin") + os.pathsep + os.environ["PATH"]

# 3) Make Spark use THIS Python
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# 4) Bind locally to avoid Windows DNS/firewall hiccups
os.environ["SPARK_LOCAL_HOSTNAME"] = "localhost"
os.environ["SPARK_LOCAL_IP"]       = "127.0.0.1"

# 5) Quick Java sanity
print("JAVA_HOME:", os.environ["JAVA_HOME"])
print("java -version:\n", subprocess.run(["java","-version"], capture_output=True, text=True).stderr.splitlines()[0])

# 6) Start Spark robustly
from pyspark.sql import SparkSession
t0 = time.time()
spark = (
    SparkSession.builder
    .appName("Zomato-MLlib")
    .master("local[1]")                              # simplest startup
    .config("spark.driver.bindAddress","127.0.0.1")
    .config("spark.driver.host","127.0.0.1")
    .config("spark.ui.enabled","false")              # avoid UI port binding
    .config("spark.ui.showConsoleProgress","false")
    .config("spark.sql.shuffle.partitions","4")
    .config("spark.pyspark.python", sys.executable)  # belt & suspenders
    .config("spark.pyspark.driver.python", sys.executable)
    .getOrCreate()
)
print(f"Spark started ✓ in {time.time()-t0:.1f}s — version:", spark.version)
spark.range(5).show()


PYTHON: c:\Users\aravi\Desktop\data_Science_technologies\spark-venv\Scripts\python.exe
JAVA_HOME: C:\Program Files\Eclipse Adoptium\jdk-17.0.16.8-hotspot
java -version:
 openjdk version "17.0.16" 2025-07-15
Spark started ✓ in 0.0s — version: 4.0.1
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [167]:
# Start Spark session for this notebook
spark = SparkSession.builder.appName("Zomato-MLlib").getOrCreate()
print("Spark started ✓")

Spark started ✓


In [168]:
#first we are bringing the data to spark
df_spark = spark.createDataFrame(df)        


In [169]:
#next we are taking some features 
from pyspark.sql.functions import col

In [None]:
#### pyspark showing regression using linear regression

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName("Zomato-Week7Style").getOrCreate()

# loading the dataset
df_new = spark.read.csv("zomato_df_final_data.csv", header=True, inferSchema=True)

# selected rating_number for prediction
label_col = "rating_number"
feature_cols = [c for c in ["cost", "votes", "cost_2"] if c in df_new.columns]
df = df_new.select([label_col] + feature_cols)

#data cleaning process
#Extracting numeric token from string values, cast to double; leave non-numeric as NULL
num_pat = r'([+-]?(?:\d+(?:\.\d+)?|\.\d+))'
for c in [label_col] + feature_cols:
    s = F.trim(F.col(c).cast("string"))
    tok = F.regexp_extract(s, num_pat, 1)
    df = df.withColumn(c, F.when(tok == "", None).otherwise(tok.cast(DoubleType())))

# undertaking imputation for missing values
df = df.filter(F.col(label_col).isNotNull()).na.fill(0, subset=feature_cols)

# pipeline stages
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="keep")
lr = LinearRegression(featuresCol="features", labelCol=label_col)

pipeline = Pipeline(stages=[assembler, lr])

# train- test split(80,20)
train, test = df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)

# calculating rmse and r2 values
pred = model.transform(test)
rmse = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse").evaluate(pred)
r2   = RegressionEvaluator(labelCol=label_col, predictionCol="prediction",  metricName="r2").evaluate(pred)

print(f"RMSE: {rmse:.3f}  |  R²: {r2:.3f}")




RMSE: 10.887  |  R²: -6.370


In [193]:
##classification using ml pipeline with logistic regression

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# loading the dataset
df_raw = spark.read.csv("zomato_df_final_data.csv", header=True, inferSchema=True)

# here we are selecting the columns like cost,votes, cost_2
feature_cols_all = ["cost", "votes", "cost_2"]
feature_cols = [c for c in feature_cols_all if c in df_raw.columns]
df = df_raw.select(["rating_text"] + feature_cols)

# for classification we use binary -- poor + average = 1, else it is 0
df = df.withColumn(
    "label",
    F.when(F.col("rating_text").isin("Poor", "Average"), 1.0).otherwise(0.0)
)

# cleaning of the features that are numeric in nature
num_pat = r'([+-]?(?:\d+(?:\.\d+)?|\.\d+))'
def coerce_numeric(df, cols):
    for c in cols:
        s = F.trim(F.col(c).cast("string"))
        tok = F.regexp_extract(s, num_pat, 1)
        df = df.withColumn(c, F.when(tok == "", None).otherwise(tok.cast(DoubleType())))
    return df

df = coerce_numeric(df, feature_cols)
df = df.na.fill(0, subset=feature_cols)

# implementing logistic regression
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="keep")
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=50)

#pipeline stage
pipeline = Pipeline(stages=[assembler, lr])

# train/test split - 80,20 
train, test = df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)

# predictions for the model
pred = model.transform(test).select("rating_text", "label", "prediction", "probability")
pred.show(10, truncate=False)

# evaluation of the prediction using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="probability")
auc = evaluator.evaluate(pred)
print("The test AUC result:",round(auc,3))


+-----------+-----+----------+---------------------------------------+
|rating_text|label|prediction|probability                            |
+-----------+-----+----------+---------------------------------------+
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
|NULL       |0.0  |0.0       |[0.707919533089761,0.29208046691023903]|
+-----------+-----+----------+---------------------------------------+
only s

In [None]:
#The auc value of 0.597 shows that the model performs slightly better but not that strong as we used features which are simple like cost,votes etc

In [None]:
#-----Comparison of the results of Scikit-learn and Pyspark-----

In [None]:
#The Sci-kit learn was found to be more easier in the process as the pyspark required additional running of the kernel and steps.

#Accuracy
#In terms of accuracy the former was giving better results compared to pyspark.
#In terms of scalability pyspark seems to be better as it can scale more efficiently than Scikit-learn and show the cluster ready nature. The Sci-kit learn is more useful for smaller datasets.
#In terms of speed, Pyspark performs better with larger datasets and is slow with smaller datasets. But Scikit-learn is fast with smaller datasets.