<a href="https://colab.research.google.com/github/Franciscotor1/Cotizacion-autos/blob/master/BaldursGate3Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Instala Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Descarga e instala Apache Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark

# Configura las variables de entorno
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

# Añade Spark al sistema
import findspark
findspark.init()

# Crea una sesión Spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("example").getOrCreate()


In [2]:
spark.version

'3.1.2'

In [3]:
from pyspark.sql import SparkSession

# Crear una sesión Spark
spark = SparkSession.builder.master("local[*]").appName("example").getOrCreate()

# Cargar el archivo CSV en un DataFrame de Spark
file_path = "/content/baldursgate3.csv"  # Asegúrate de que la ruta sea correcta
df = spark.read.csv(file_path, header=True, inferSchema=True)


In [4]:
df.printSchema()

root
 |-- recommendationid: string (nullable = true)
 |-- language: string (nullable = true)
 |-- review: string (nullable = true)
 |-- timestamp_created: string (nullable = true)
 |-- timestamp_updated: string (nullable = true)
 |-- voted_up: string (nullable = true)
 |-- votes_up: string (nullable = true)
 |-- votes_funny: string (nullable = true)
 |-- weighted_vote_score: string (nullable = true)
 |-- written_during_early_access: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- steam_purchase: string (nullable = true)
 |-- received_for_free: string (nullable = true)



In [5]:
df.show(5)


+----------------+--------+--------------------+-----------------+-----------------+--------+--------+-----------+-------------------+---------------------------+-------------+--------------+-----------------+
|recommendationid|language|              review|timestamp_created|timestamp_updated|voted_up|votes_up|votes_funny|weighted_vote_score|written_during_early_access|comment_count|steam_purchase|received_for_free|
+----------------+--------+--------------------+-----------------+-----------------+--------+--------+-----------+-------------------+---------------------------+-------------+--------------+-----------------+
|       153560814| english|This game hits al...|       1702542971|       1702542971|    TRUE|       0|          0|                  0|                      FALSE|            0|          TRUE|            FALSE|
|       153560623| english|took me like 11 h...|       1702542657|       1702542657|    TRUE|       0|          0|                  0|                      FALS

In [6]:
total_reviews = df.count()
print("Número total de revisiones:", total_reviews)


Número total de revisiones: 518102


In [7]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, BooleanType, DoubleType, LongType

# Convertir columnas a tipos de datos apropiados
df = df.withColumn("timestamp_created", df["timestamp_created"].cast(LongType()))
df = df.withColumn("timestamp_updated", df["timestamp_updated"].cast(LongType()))
df = df.withColumn("voted_up", df["voted_up"].cast(BooleanType()))
df = df.withColumn("votes_up", df["votes_up"].cast(IntegerType()))
df = df.withColumn("votes_funny", df["votes_funny"].cast(IntegerType()))
df = df.withColumn("weighted_vote_score", df["weighted_vote_score"].cast(DoubleType()))
df = df.withColumn("written_during_early_access", df["written_during_early_access"].cast(BooleanType()))
df = df.withColumn("comment_count", df["comment_count"].cast(IntegerType()))
df = df.withColumn("steam_purchase", df["steam_purchase"].cast(BooleanType()))
df = df.withColumn("received_for_free", df["received_for_free"].cast(BooleanType()))

# Mostrar el esquema actualizado
df.printSchema()


root
 |-- recommendationid: string (nullable = true)
 |-- language: string (nullable = true)
 |-- review: string (nullable = true)
 |-- timestamp_created: long (nullable = true)
 |-- timestamp_updated: long (nullable = true)
 |-- voted_up: boolean (nullable = true)
 |-- votes_up: integer (nullable = true)
 |-- votes_funny: integer (nullable = true)
 |-- weighted_vote_score: double (nullable = true)
 |-- written_during_early_access: boolean (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- steam_purchase: boolean (nullable = true)
 |-- received_for_free: boolean (nullable = true)



In [8]:
df.describe(["votes_up", "votes_funny", "comment_count", "weighted_vote_score"]).show()


+-------+--------------------+--------------------+-------------------+-------------------+
|summary|            votes_up|         votes_funny|      comment_count|weighted_vote_score|
+-------+--------------------+--------------------+-------------------+-------------------+
|  count|              295193|              269819|             258145|             287623|
|   mean|1.5473706718496034E7|1.0159547688502293E7|  2362349.401371322|  5810425.221443001|
| stddev|  1.60572080755906E8|1.3026095585011157E8|6.296448838106158E7|9.863587472439855E7|
|    min|                   0|                   0|                  0|                0.0|
|    max|          1702532552|          1702499012|         1702372383|      1.702512969E9|
+-------+--------------------+--------------------+-------------------+-------------------+



In [9]:
df.groupBy("language").agg({"votes_up": "avg", "comment_count": "sum"}).show()


+--------------------+------------------+-------------+
|            language|sum(comment_count)|avg(votes_up)|
+--------------------+------------------+-------------+
|                 -' |              null|         null|
|          1702523972|              null|          0.0|
| I really wish I ...|              null|         null|
| and details are ...|              null|         null|
| with its beautif...|              null|         null|
|          1701880924|              null|          0.0|
|          1701696847|              null|          0.0|
|                 far|              null|         null|
| especially fans ...|              null|         null|
| I had no idea wh...|              null|         null|
| my character suc...|              null|          0.0|
| luckily BG3 skip...|              null|         null|
| the storytelling...|              null|         null|
| but by god they ...|              null|         null|
|          1701181392|              null|       

In [10]:
df_cleaned = df.na.drop(subset=["comment_count", "votes_up"])
summary_result = df_cleaned.groupBy("language").agg({"comment_count": "sum", "votes_up": "avg"})
summary_result.show()


+--------------------+------------------+-------------+
|            language|sum(comment_count)|avg(votes_up)|
+--------------------+------------------+-------------+
| because it mimic...|                 0|          1.0|
| after almost 4 m...|                 0|1.700860764E9|
| detailed charact...|                 0|          0.0|
| Baldurs Gate 3 i...|                 0|1.700620375E9|
|  great with friends|                 0|          0.0|
| with a lot of co...|                 0|          0.0|
| Baldur's Gate 3 ...|                 0|          1.0|
| BG3 is a masterp...|                 0|          0.0|
|           reactions|                 0|          1.0|
| and the game con...|                 0|          1.0|
|      character menu|                 0|          4.0|
| Baldur's Gate 3 ...|                 0|1.694400854E9|
|      good graphics |                 0|          0.0|
|         it's a very|                 0|1.694231613E9|
|   story you explore|                 0|1.69389