# Facebook Data Analysis

### Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.functions import *

### Create a Spark session

In [2]:
# Start (or reuse) the Spark session that the cells below query through.
spark = (
    SparkSession.builder
    .appName('Facebook_analysis')
    .getOrCreate()
)

### Upload the dataset

In [3]:
# Load the CSV into a Spark DataFrame: column names come from the header row
# and Spark infers each column's type from the data.
df = spark.read.csv(
    'pseudo_facebook.csv',
    header=True,
    escape='"',
    inferSchema=True,
)

# Preview the first five rows.
df.show(n=5)

+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
| userid|age|dob_day|dob_year|dob_month|gender|tenure|friend_count|friendships_initiated|likes|likes_received|mobile_likes|mobile_likes_received|www_likes|www_likes_received|
+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|2094382| 14|     19|    1999|       11|  male|   266|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|1192601| 14|      2|    1999|       11|female|     6|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|2083884| 14|     16|    1999|       11|  male|    13|           0|                    0|    0|             0|           0|  

In [4]:
# List the column names of the loaded DataFrame.
df.columns

['userid',
 'age',
 'dob_day',
 'dob_year',
 'dob_month',
 'gender',
 'tenure',
 'friend_count',
 'friendships_initiated',
 'likes',
 'likes_received',
 'mobile_likes',
 'mobile_likes_received',
 'www_likes',
 'www_likes_received']

In [5]:
# List each column together with its inferred data type.
# NOTE(review): the output reports `tenure` as string while the other
# numeric-looking columns are int — presumably the column contains
# non-numeric markers such as "NA"; confirm before doing arithmetic on it.
df.dtypes

[('userid', 'int'),
 ('age', 'int'),
 ('dob_day', 'int'),
 ('dob_year', 'int'),
 ('dob_month', 'int'),
 ('gender', 'string'),
 ('tenure', 'string'),
 ('friend_count', 'int'),
 ('friendships_initiated', 'int'),
 ('likes', 'int'),
 ('likes_received', 'int'),
 ('mobile_likes', 'int'),
 ('mobile_likes_received', 'int'),
 ('www_likes', 'int'),
 ('www_likes_received', 'int')]

In [6]:
# Print the DataFrame schema as a tree (column name, type, nullability).
df.printSchema()

root
 |-- userid: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob_day: integer (nullable = true)
 |-- dob_year: integer (nullable = true)
 |-- dob_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- tenure: string (nullable = true)
 |-- friend_count: integer (nullable = true)
 |-- friendships_initiated: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- likes_received: integer (nullable = true)
 |-- mobile_likes: integer (nullable = true)
 |-- mobile_likes_received: integer (nullable = true)
 |-- www_likes: integer (nullable = true)
 |-- www_likes_received: integer (nullable = true)



### Check existence of null values

In [7]:
# Build one aggregate per column that counts the rows whose value is NULL or
# NaN; every count is aliased with its column name, giving a one-row summary.
# Note: only real NULL/NaN values are detected. String markers such as the
# "NA" gender shown by the later GROUP BY output are not counted here.
null_counts = df.select(
    [
        count(
            when(col(column_name).isNull() | isnan(column_name), column_name)
        ).alias(column_name)
        for column_name in df.columns
    ]
)

null_counts.show()

+------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|userid|age|dob_day|dob_year|dob_month|gender|tenure|friend_count|friendships_initiated|likes|likes_received|mobile_likes|mobile_likes_received|www_likes|www_likes_received|
+------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|     0|  0|      0|       0|        0|     0|     0|           0|                    0|    0|             0|           0|                    0|        0|                 0|
+------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+



### Create a Temporary SQL view

In [8]:
# Register the DataFrame as a temporary view named 'fb' so that it can be
# queried with SQL through spark.sql().
df.createOrReplaceTempView('fb')

In [9]:
# Sanity check: select every column from the 'fb' view and preview five rows.
query = "SELECT * FROM fb"
result = spark.sql(query)
result.show(5)

+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
| userid|age|dob_day|dob_year|dob_month|gender|tenure|friend_count|friendships_initiated|likes|likes_received|mobile_likes|mobile_likes_received|www_likes|www_likes_received|
+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|2094382| 14|     19|    1999|       11|  male|   266|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|1192601| 14|      2|    1999|       11|female|     6|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|2083884| 14|     16|    1999|       11|  male|    13|           0|                    0|    0|             0|           0|  

#### Total number of records

In [10]:
# Count every row in the 'fb' view.
query = "SELECT count(*) AS Total_records FROM fb"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+-------------+
|Total_records|
+-------------+
|        99003|
+-------------+



#### Create a python variable from a table value

`collect()` returns a list of `Row` objects, so to store a single table value in a Python variable we index the result like a two-dimensional array: `[row][column]`.

In [34]:
# Take the value in the first row and first column ([0][0]) of the collected result.
x = spark.sql("SELECT count(*) AS Total_records FROM fb").collect()[0][0]
print('Original number of records: ', x) 

# `x` is a plain Python value, so ordinary arithmetic works: add 20 to the record count.
print('Number of records + 20: ', x + 20)

Original number of records:  99003
Number of records + 20:  99023


### Differences based on gender

#### Number of males, females and other (NA)

In [28]:
# Number of users per gender value, largest group first.
query = "SELECT gender, count(*) AS total_number FROM fb GROUP BY gender ORDER BY total_number DESC"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------+------------+
|gender|total_number|
+------+------------+
|  male|       58574|
|female|       40254|
|    NA|         175|
+------+------------+



58574 of the users are men, 40254 are women and 175 are recorded as other (NA).

#### Average age of males, females and other (NA)

In [13]:
# Average age for each gender value.
query = "SELECT gender, AVG(age) AS Average_age FROM fb GROUP BY gender"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------+------------------+
|gender|       Average_age|
+------+------------------+
|    NA| 74.77714285714286|
|female|39.459904605753465|
|  male| 35.67024618431386|
+------+------------------+



The average age for women is about 39 years old, for men is about 36 years old and for other is about 75 years old.

#### Difference between likes received by male, female and other

In [15]:
# Total likes received per gender value, highest total first.
query = "SELECT gender, sum(likes_received) AS likes_received FROM fb GROUP BY gender ORDER BY likes_received DESC"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------+--------------+
|gender|likes_received|
+------+--------------+
|female|      10121282|
|  male|       3977851|
|    NA|         27542|
+------+--------------+



Women receive far more likes than men and other.

#### Difference between likes given by male, female and other

In [30]:
# Total likes given per gender value, highest total first.
query = "SELECT gender, sum(likes) AS likes_given FROM fb GROUP BY gender ORDER BY likes_given DESC"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------+-----------+
|gender|likes_given|
+------+-----------+
|female|   10468106|
|  male|    4959923|
|    NA|      24239|
+------+-----------+



Women also give more likes than men and other.

#### Average friends depending on the gender

In [21]:
# Average friend count per gender value, highest average first.
query = "SELECT gender, AVG(friend_count) AS avg_friends FROM fb GROUP BY gender ORDER BY avg_friends DESC"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------+------------------+
|gender|       avg_friends|
+------+------------------+
|female|241.96994087544095|
|    NA|184.41142857142856|
|  male|165.03545941885477|
+------+------------------+



On average women have more friends on Facebook, followed by other, and men.

#### Who initiates more friendships

In [35]:
# Average number of friendships initiated per gender value, highest first.
query = "SELECT gender, AVG(friendships_initiated) AS avg_friendships_init FROM fb GROUP BY gender ORDER BY avg_friendships_init DESC"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------+--------------------+
|gender|avg_friendships_init|
+------+--------------------+
|female|  113.89909077358772|
|  male|  103.06659951514324|
|    NA|   92.57142857142857|
+------+--------------------+



On average women initiate more friendships, followed by men, and other.

### Age demographics  

#### Average age of a Facebook user

In [12]:
# Average age across all users in the 'fb' view.
query = "SELECT AVG(age) AS Average_age FROM fb"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+-----------------+
|      Average_age|
+-----------------+
|37.28022383160106|
+-----------------+



The average age of a Facebook user is about 37 years old.

#### What is the average number of friends for people aged between 13 and 25?

In [16]:
# Average friend count for users aged 13 to 25 (both bounds inclusive).
query = "SELECT AVG(friend_count) AS avg_friends FROM fb WHERE age >= 13 AND age <= 25 "
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+-----------------+
|      avg_friends|
+-----------------+
|268.5517990880525|
+-----------------+



#### What is the average number of friends for people aged between 30 and 50?

In [17]:
# Average friend count for users aged 30 to 50 (both bounds inclusive).
query = "SELECT AVG(friend_count) AS avg_friends FROM fb WHERE age >= 30 AND age <= 50 "
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------------------+
|       avg_friends|
+------------------+
|106.38998921251348|
+------------------+



Younger people, aged between 13 and 25, have more friends on average than older people, aged between 30 and 50.

#### What is the average usage of mobile and website for people aged between 13 and 25?

In [18]:
# Average mobile likes and website likes for users aged 13 to 25 (inclusive).
query = "SELECT AVG(mobile_likes) AS avg_mobilelikes, AVG(www_likes) AS avg_wwwlikes FROM fb WHERE age >= 13 AND age <= 25 "
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+------------------+-----------------+
|   avg_mobilelikes|     avg_wwwlikes|
+------------------+-----------------+
|123.98981737425284|55.50010631511801|
+------------------+-----------------+



#### What is the average usage of mobile and website for people aged between 30 and 50?

In [19]:
# Average mobile likes and website likes for users aged 30 to 50 (inclusive).
query = "SELECT AVG(mobile_likes) AS avg_mobilelikes, AVG(www_likes) AS avg_wwwlikes FROM fb WHERE age >= 30 AND age <= 50 "
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+-----------------+------------------+
|  avg_mobilelikes|      avg_wwwlikes|
+-----------------+------------------+
|99.11257820927725|32.016871628910465|
+-----------------+------------------+



#### What is the average usage of mobile and website for people above 50?

In [20]:
# Average mobile likes and website likes for users older than 50.
query = "SELECT AVG(mobile_likes) AS avg_mobilelikes, AVG(www_likes) AS avg_wwwlikes FROM fb WHERE age >50 "
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+----------------+-----------------+
| avg_mobilelikes|     avg_wwwlikes|
+----------------+-----------------+
|89.4766420939286|69.11944007288992|
+----------------+-----------------+



Regardless of age group, the mobile application has higher usage (measured in average likes) than the web page.

#### Top 5 users with more friends on Facebook

In [40]:
# The five users with the highest friend counts.
query = "SELECT userid, gender, friend_count FROM fb ORDER BY friend_count DESC LIMIT 5"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+-------+------+------------+
| userid|gender|friend_count|
+-------+------+------------+
|2090699|female|        4923|
|1660276|  male|        4917|
|1926655|female|        4863|
|1685573|  male|        4845|
|1386477|  male|        4844|
+-------+------+------------+



The user IDs with the most friends are 2090699, 1660276, 1926655, 1685573, 1386477.

#### Top 5 users that received more likes on Facebook

In [38]:
# The five users who received the most likes.
query = "SELECT userid, gender, likes_received FROM fb ORDER BY likes_received DESC LIMIT 5"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+-------+------+--------------+
| userid|gender|likes_received|
+-------+------+--------------+
|1674584|female|        261197|
|1441676|female|        178166|
|1715925|female|        152014|
|2063006|female|        106025|
|1053087|  male|         82623|
+-------+------+--------------+



The user IDs who received the most likes are 1674584, 1441676, 1715925, 2063006, 1053087.

#### Top 5 users who gave more likes on Facebook

In [39]:
# The five users who gave the most likes.
query = "SELECT userid, gender, likes FROM fb ORDER BY likes DESC LIMIT 5"
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `result` and display it in a separate step.
result = spark.sql(query)
result.show()

+-------+------+-----+
| userid|gender|likes|
+-------+------+-----+
|1684195|  male|25111|
|1656477|  male|21652|
|1489463|female|16732|
|1429178|female|16583|
|1267229|female|14799|
+-------+------+-----+



The user IDs who gave the most likes are 1684195, 1656477, 1489463, 1429178, 1267229.