## IMPORTS

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, sum


### READING DATASET

In [3]:
kickstarter_data= pd.read_csv('kick_starter.csv')

print(kickstarter_data.columns)
kickstarter_data.head()

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'goal', 'launched', 'pledged', 'state', 'backers', 'country',
       'usd pledged', 'usd_pledged_real', 'usd_goal_real'],
      dtype='object')


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


## We noticed that we can group based on category and also create different bins to cluster companies with different goals and pledge amounts

In [5]:
print('Mean : ',kickstarter_data['usd_goal_real'].mean())
print('Maximum Value',kickstarter_data['usd_goal_real'].max())
print('Minimum Value',kickstarter_data['usd_goal_real'].min())
print('Standard deviation:',kickstarter_data['usd_goal_real'].std())
print('Median : ',kickstarter_data['usd_goal_real'].median())
print('variance : ',kickstarter_data['usd_goal_real'].var())

Mean :  45454.40146545326
Maximum Value 166361390.71
Minimum Value 0.01
Standard deviation: 1152950.0550888747
Median :  5500.0
variance :  1329293829529.4392


In [31]:
kickstarter_data['country'].value_counts()

country
US      292627
GB       33672
CA       14756
AU        7839
DE        4171
N,0"      3797
FR        2939
IT        2878
NL        2868
ES        2276
SE        1757
MX        1752
NZ        1447
DK        1113
IE         811
CH         761
NO         708
HK         618
BE         617
AT         597
SG         555
LU          62
JP          40
Name: count, dtype: int64

## Binning goal and pledged amounts

In [5]:
# Define bin edges and labels for goal and pledged amount
goal_bins = [0, 1000, 10000, 100000, 1000000, kickstarter_data['usd_goal_real'].max()]
goal_labels = ['Low', 'Medium', 'High', 'Very High', 'Extremely High']

pledged_bins = [0, 1000, 10000, 100000, 1000000, kickstarter_data['usd_pledged_real'].max()]
pledged_labels = ['Low', 'Medium', 'High', 'Very High', 'Extremely High']

# Create binned categories
kickstarter_data['goal_binned'] = pd.cut(kickstarter_data['usd_goal_real'], bins=goal_bins, labels=goal_labels)
kickstarter_data['pledged_binned'] = pd.cut(kickstarter_data['usd_pledged_real'], bins=pledged_bins, labels=pledged_labels)

# Analyze the relationship between binned categories and project state
goal_state_distribution = kickstarter_data.groupby(['goal_binned', 'state']).size().unstack().fillna(0)
pledged_state_distribution = kickstarter_data.groupby(['pledged_binned', 'state']).size().unstack().fillna(0)



  goal_state_distribution = kickstarter_data.groupby(['goal_binned', 'state']).size().unstack().fillna(0)
  pledged_state_distribution = kickstarter_data.groupby(['pledged_binned', 'state']).size().unstack().fillna(0)


### DISTRIBUTION OF GOAL AMOUNTS 

In [6]:
kickstarter_data['goal_binned'] = pd.cut(kickstarter_data['usd_goal_real'], bins=goal_bins, labels=goal_labels)

# Visualize the distribution of project goals after binning
goal_binned_distribution = kickstarter_data['goal_binned'].value_counts().sort_index().reset_index()
goal_binned_distribution.columns = ['Goal Binned', 'Frequency']

# Create the interactive bar chart
fig = px.bar(
    goal_binned_distribution,
    x='Goal Binned',
    y='Frequency',
    title='Distribution of Project Goals after Binning',
    labels={'Goal Binned': 'Binned Goal Categories', 'Frequency': 'Frequency'},
    color='Goal Binned'
)

# Show the plot
fig.show()

### DISTRIBUTION OF PLEDGE AMOUNTS

In [7]:
kickstarter_data['pledged_binned'] = pd.cut(kickstarter_data['usd_pledged_real'], bins=pledged_bins, labels=pledged_labels)

# Visualize the distribution of project pledges after binning
pledged_binned_distribution = kickstarter_data['pledged_binned'].value_counts().sort_index().reset_index()
pledged_binned_distribution.columns = ['Pledged Binned', 'Frequency']

# Create the interactive bar chart
fig = px.bar(
    pledged_binned_distribution,
    x='Pledged Binned',
    y='Frequency',
    title='Distribution of Project Pledges after Binning',
    labels={'Pledged Binned': 'Binned Pledge Categories', 'Frequency': 'Frequency'},
    color='Pledged Binned'
)

# Show the plot
fig.show()

### Distribution of project goals after binning, grouped by the main categories.

In [8]:
category_goal_binned_distribution = kickstarter_data.groupby(['main_category', 'goal_binned']).size().unstack().fillna(0)

# Convert the DataFrame to long format for Plotly
category_goal_binned_distribution = category_goal_binned_distribution.reset_index()
category_goal_binned_distribution_melted = category_goal_binned_distribution.melt(id_vars=['main_category'], value_vars=goal_labels, var_name='Goal Bins', value_name='Frequency')

# Create the interactive grouped bar chart
fig = px.bar(
    category_goal_binned_distribution_melted,
    x='main_category',
    y='Frequency',
    color='Goal Bins',
    title='Binned Goal Distribution by Main Category',
    labels={'main_category': 'Main Category', 'Frequency': 'Frequency'},
    barmode='group'
)

fig.show()





In [9]:
# Visualize the distribution of project states
project_state_distribution = kickstarter_data['state'].value_counts().reset_index()
project_state_distribution.columns = ['Project State', 'Frequency']

# Create the interactive bar chart
fig = px.bar(
    project_state_distribution,
    x='Project State',
    y='Frequency',
    title='Distribution of Project States',
    labels={'Project State': 'Project State', 'Frequency': 'Frequency'},
    color='Project State'
)

# Show the plot
fig.show()

### Different states across categories

In [18]:
state_category_distribution = kickstarter_data.groupby(['main_category', 'state']).size().reset_index(name='Frequency')

# Create the interactive bar chart
fig = px.bar(
    state_category_distribution,
    x='main_category',
    y='Frequency',
    color='state',
    title='Distribution of Project States by Main Category',
    labels={'main_category': 'Main Category', 'Frequency': 'Frequency', 'state': 'Project State'},
    barmode='group'
)

# Show the plot
fig.show()


# Initializing session

In [20]:
spark = SparkSession.builder \
    .appName("Kickstarter Analysis") \
    .getOrCreate()

## Loading data set onto spark

In [21]:
kickstarter_df = spark.read.csv('kick_starter.csv', header=True, inferSchema=True)

kickstarter_df.printSchema()


root
 |-- ID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- deadline: string (nullable = true)
 |-- goal: string (nullable = true)
 |-- launched: string (nullable = true)
 |-- pledged: string (nullable = true)
 |-- state: string (nullable = true)
 |-- backers: string (nullable = true)
 |-- country: string (nullable = true)
 |-- usd pledged: string (nullable = true)
 |-- usd_pledged_real: string (nullable = true)
 |-- usd_goal_real: string (nullable = true)



## Creating temp view

In [22]:
kickstarter_df.createOrReplaceTempView("kickstarter")


### Distribution of project states

In [28]:
dist_states = kickstarter_df.groupBy("state").agg(count("*").alias("count")).orderBy(col("count").desc())
dist_states.show(100)

+----------+------+
|     state| count|
+----------+------+
|    failed|197052|
|successful|133429|
|  canceled| 38687|
| undefined|  3556|
|      live|  2796|
| suspended|  1844|
|      0.00|   117|
|     25.00|    16|
|     10.00|    15|
|      5.00|    13|
|     50.00|    12|
|    100.00|    11|
|     20.00|    10|
|      1.00|    10|
|    200.00|     8|
|     70.00|     7|
|    260.00|     6|
|      2.00|     6|
|     75.00|     6|
|      NULL|     5|
|   2500.00|     5|
|     90.00|     5|
|     30.00|     5|
|     26.00|     5|
|   1000.00|     4|
|     40.00|     4|
|     15.00|     4|
|    160.00|     4|
|     35.00|     4|
|    201.00|     4|
|    105.00|     4|
|   2000.00|     4|
|    125.00|     3|
|   1600.00|     3|
|    600.00|     3|
|    400.00|     3|
|      6.00|     3|
|   1500.00|     3|
|    275.00|     3|
|    110.00|     3|
|    150.00|     3|
|     61.00|     3|
|     36.00|     3|
|    230.00|     3|
|    380.00|     3|
|    175.00|     3|
|    255.00|     3|


### Average goal by main category

In [29]:
avg_goal_per_category = kickstarter_df.groupBy("main_category").agg(avg("usd_goal_real").alias("avg_goal")).orderBy(col("avg_goal").desc())
avg_goal_per_category.show()

+--------------------+------------------+
|       main_category|          avg_goal|
+--------------------+------------------+
|          Technology|102337.16692853857|
|          Journalism| 86126.77848504003|
|        Film & Video|  82810.4782493369|
|        & Destiny"""|           65500.9|
|                Food| 49050.42487737421|
|               Games| 44933.13868885985|
|              Design| 41851.69322876079|
|                 Art| 39280.61011753392|
|           by Blule"|          34063.49|
|             Theater|27116.552577622635|
|       The Movement"|          27029.88|
| and universal so...|           25499.0|
|          Publishing| 24875.51560176543|
|     and Burnouts"""|           22540.0|
|             Fashion|22407.276651895387|
|"" A Children's B...|          20123.47|
|              Comics|19554.330581922477|
|             Redux!"|           16735.0|
|               Music|15385.464899293806|
|  I'll record a C...|           15220.0|
+--------------------+------------

### Total pledged amount by country

In [30]:
total_pledged_by_country = kickstarter_df.groupBy("country").agg(sum("usd_pledged_real").alias("total_pledged")).orderBy(col("total_pledged").desc())
total_pledged_by_country.show()

+-------+--------------------+
|country|       total_pledged|
+-------+--------------------+
|     US| 2.825423358940002E9|
|     GB|2.3649768392999986E8|
|     CA| 9.176871059000003E7|
|     AU|4.6639553220000006E7|
|     DE| 3.976059530999999E7|
|     FR|        3.43437367E7|
|     NL|       2.924278234E7|
|     IT|1.6374247670000002E7|
|     SE|         1.3409567E7|
|     CH|1.3043361440000003E7|
|     ES|1.2919994580000002E7|
|  N,0""|1.1181758739999998E7|
|     HK|1.0029484940000001E7|
|     AT|          9665788.56|
|     NZ|   7821407.890000001|
|     DK|   7582896.989999999|
|     SG|   5842134.369999999|
|     IE|          4187029.79|
|     BE|   4181557.589999999|
|     NO|          2668032.83|
+-------+--------------------+
only showing top 20 rows



## We can conclude after the exploration above; that majority of the projects had put a pledge between 0 and 1000 usd and the most common project state was failed, there could be underlying correlation between these variables , furthermore majority of the expected goal amount fell in the medium bin across most categories except in the fields of tech , food and design ;this may indicate that some industries could have higher expectations in terms of achieving the goal. Worth mentioning that there are undefined states belonging to country NO" that are to be removed in data cleaning as they are hard to estimate and better to drop to not affect the model judgement moving on.