In [1]:
import os
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
print(os.getenv('TF_GPU_ALLOCATOR'))

cuda_malloc_async


In [2]:
import findspark
findspark.init()

In [3]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
import numpy as np
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window

sc= SparkContext()
sqlContext = SQLContext(sc)



In [4]:
df = sqlContext.read.load(r"C:\Users\apurv\Downloads\ipl\ipldata.csv", 
                      format='com.databricks.spark.csv', 
                      header='true', 
                      inferSchema='true')
df.show()

+---+--------+------+--------+----+----------+-----------+---------+------------+------+-----+----------+-----------+----------+--------+-------------+----------------+-------+-------------+----+----------+---------------+-------------+-------------+----------+--------------------+----+------+--------+----------+
|_c0|match id|inning|delivery|over|   batsman|non striker|   bowler|runs off bat|extras|total|extra kind|wicket kind|player out|fielders|        team1|           team2|outcome|       winner|  by|win amount|player of match|  toss winner|toss decision|match type|               venue|city|gender| umpire1|   umpire2|
+---+--------+------+--------+----+----------+-----------+---------+------------+------+-----+----------+-----------+----------+--------+-------------+----------------+-------+-------------+----+----------+---------------+-------------+-------------+----------+--------------------+----+------+--------+----------+
|  0|  598067|     1|     0.1| 1.0|RV Uthappa|   AJ Fin

In [5]:
df.cache()
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- match id: integer (nullable = true)
 |-- inning: integer (nullable = true)
 |-- delivery: double (nullable = true)
 |-- over: double (nullable = true)
 |-- batsman: string (nullable = true)
 |-- non striker: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- runs off bat: integer (nullable = true)
 |-- extras: integer (nullable = true)
 |-- total: integer (nullable = true)
 |-- extra kind: string (nullable = true)
 |-- wicket kind: string (nullable = true)
 |-- player out: string (nullable = true)
 |-- fielders: string (nullable = true)
 |-- team1: string (nullable = true)
 |-- team2: string (nullable = true)
 |-- outcome: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- by: string (nullable = true)
 |-- win amount: integer (nullable = true)
 |-- player of match: string (nullable = true)
 |-- toss winner: string (nullable = true)
 |-- toss decision: string (nullable = true)
 |-- match type: string (nullab

# Batsman prediction

In [6]:
batsman=input("Enter the batsman name: ")
df2=df.filter(df.batsman == batsman)
df2.show()

from pyspark.sql.functions import col
main_col=df2.select(col("inning"),col('over'),col('delivery'),col('runs off bat'),col('extras'),col('total'))

Enter the batsman name: RV Uthappa
+---+--------+------+--------+----+----------+-----------+---------+------------+------+-----+----------+-----------+----------+--------+-------------+----------------+-------+-------------+----+----------+---------------+-------------+-------------+----------+--------------------+----+------+--------+----------+
|_c0|match id|inning|delivery|over|   batsman|non striker|   bowler|runs off bat|extras|total|extra kind|wicket kind|player out|fielders|        team1|           team2|outcome|       winner|  by|win amount|player of match|  toss winner|toss decision|match type|               venue|city|gender| umpire1|   umpire2|
+---+--------+------+--------+----+----------+-----------+---------+------------+------+-----+----------+-----------+----------+--------+-------------+----------------+-------+-------------+----+----------+---------------+-------------+-------------+----------+--------------------+----+------+--------+----------+
|  0|  598067|     1

In [7]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['delivery'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(main_col)
vhouse_df = vhouse_df.select(['features', 'runs off bat'])
vhouse_df.show(3)

+--------+------------+
|features|runs off bat|
+--------+------------+
|   [0.1]|           0|
|   [0.2]|           2|
|   [0.3]|           4|
+--------+------------+
only showing top 3 rows



In [8]:
splits = vhouse_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

Linear Regression

In [9]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='runs off bat', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

Coefficients: [0.0]
Intercept: 1.2854420731707317
RMSE: 1.659599
r2: 0.000000
numIterations: 0
objectiveHistory: [0.4999999999999999]




+--------------------+
|           residuals|
+--------------------+
| -1.2854420731707317|
| -1.2854420731707317|
| -1.2854420731707317|
| -1.2854420731707317|
| -1.2854420731707317|
|  0.7145579268292683|
| -1.2854420731707317|
|-0.28544207317073167|
|  2.7145579268292686|
| -1.2854420731707317|
|-0.28544207317073167|
|  2.7145579268292686|
|-0.28544207317073167|
|  2.7145579268292686|
| -1.2854420731707317|
| -1.2854420731707317|
| -1.2854420731707317|
| -1.2854420731707317|
|-0.28544207317073167|
|  2.7145579268292686|
+--------------------+
only showing top 20 rows



K Means Clustering

In [10]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans
# Trains a k-means model.
kmeans = KMeans().setK(7).setSeed(1)
model = kmeans.fit(vhouse_df)
# Make predictions
predictions = model.transform(vhouse_df)
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
print("Cluster Centers: ")
ctr=[]
centers = model.clusterCenters()
for center in centers:
    ctr.append(center)
    print(center)

Silhouette with squared euclidean distance = 0.6982550765454786
Cluster Centers: 
[1.08977099]
[16.93630769]
[13.20189573]
[3.57800312]
[8.12269044]
[5.83938849]
[10.53977273]


Logistic Regression

In [13]:
from pyspark.sql import Row
from pyspark.ml.classification import LogisticRegression
c = LogisticRegression(weightCol="delivery")

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
b = vhouse_df
b.show()

c.setRegParam(0.01)
c.getRegParam()
c.setMaxIter(10)
c.getMaxIter()
c.setFeaturesCol("features")

+--------+------------+
|features|runs off bat|
+--------+------------+
|   [0.1]|           0|
|   [0.2]|           2|
|   [0.3]|           4|
|   [0.4]|           0|
|   [0.5]|           1|
|   [1.1]|           0|
|   [1.2]|           3|
|   [1.4]|           1|
|   [2.1]|           1|
|   [2.4]|           1|
|   [3.1]|           4|
|   [3.2]|           1|
|   [3.6]|           1|
|   [4.1]|           0|
|   [4.2]|           0|
|   [4.3]|           0|
|   [4.4]|           0|
|   [4.5]|           4|
|   [4.6]|           1|
|   [5.1]|           0|
+--------+------------+
only showing top 20 rows



LogisticRegression_dfa2bcf7cace

# Bowler prediction

In [14]:
bowler=input("Enter the bowler name: ")
df3=df.filter(df.batsman == bowler)
df3.show()
from pyspark.sql.functions import col
  
main_col=df3.select(col("inning"),col('over'),col('delivery'),col('runs off bat'),col('extras'),col('total'))
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['delivery'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(main_col)
vhouse_df = vhouse_df.select(['features', 'runs off bat'])
vhouse_df.show(3)
splits = vhouse_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

Enter the bowler name: S Kaul
+-----+--------+------+--------+----+-------+--------------+-----------------+------------+------+-----+----------+-----------+----------+---------+-------------------+--------------------+-------+--------------------+-------+----------+---------------+--------------------+-------------+----------+--------------------+---------+------+-------------+-----------+
|  _c0|match id|inning|delivery|over|batsman|   non striker|           bowler|runs off bat|extras|total|extra kind|wicket kind|player out| fielders|              team1|               team2|outcome|              winner|     by|win amount|player of match|         toss winner|toss decision|match type|               venue|     city|gender|      umpire1|    umpire2|
+-----+--------+------+--------+----+-------+--------------+-----------------+------------+------+-----+----------+-----------+----------+---------+-------------------+--------------------+-------+--------------------+-------+----------+-----

Linear Regression

In [15]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='runs off bat', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

Coefficients: [0.1893079464237209]
Intercept: -2.7950708501873414
RMSE: 0.849195
r2: 0.223749
numIterations: 2
objectiveHistory: [0.5, 0.4850190203160912, 0.45937348907875664]
+--------------------+
|           residuals|
+--------------------+
| 0.16035827885190024|
|  3.0846351002824117|
|-0.08240993545321595|
|-0.12027152473796043|
|-0.13920231938033245|
|  -0.271717881876937|
| -0.2906486765193095|
|-0.30957947116168105|
| -0.3285102658040535|
| 0.14142748420952778|
|  0.3496662252756213|
| 0.16035827885190024|
| -0.5367490068701466|
| -0.6503337747243787|
| -0.6881953640091227|
| -0.7260569532938677|
|  0.1224966895671562|
| -0.5367490068701466|
| -0.6503337747243787|
| 0.33073543063324884|
+--------------------+
only showing top 20 rows



K Means Clustering

In [16]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans
# Trains a k-means model.
kmeans = KMeans().setK(7).setSeed(1)
model = kmeans.fit(vhouse_df)
# Make predictions
predictions = model.transform(vhouse_df)
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
print("Cluster Centers: ")
ctr=[]
centers = model.clusterCenters()
for center in centers:
    ctr.append(center)
    print(center)

Silhouette with squared euclidean distance = 0.8137728195066373
Cluster Centers: 
[18.39166667]
[15.4]
[19.5]
[16.35]
[19.6]
[17.53333333]
[19.31428571]


Logistic Regression

In [17]:
from pyspark.sql import Row
from pyspark.ml.classification import LogisticRegression
c = LogisticRegression(weightCol="runs off bat")

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
b = vhouse_df
b.show()

c.setRegParam(0.01)
c.getRegParam()
c.setMaxIter(10)
c.getMaxIter()
c.setFeaturesCol("features")

+--------+------------+
|features|runs off bat|
+--------+------------+
|  [18.4]|           0|
|  [18.5]|           1|
|  [19.2]|           1|
|  [19.4]|           1|
|  [19.6]|           4|
|  [17.4]|           0|
|  [15.2]|           0|
|  [15.3]|           2|
|  [15.4]|           0|
|  [15.5]|           0|
|  [15.6]|           0|
|  [16.2]|           0|
|  [16.3]|           0|
|  [16.4]|           0|
|  [16.5]|           0|
|  [19.3]|           1|
|  [18.2]|           1|
|  [19.2]|           1|
|  [17.6]|           0|
|  [18.2]|           0|
+--------+------------+
only showing top 20 rows



LogisticRegression_8d7acebd84db