<a href="https://colab.research.google.com/github/EmilKJohn99/hello/blob/main/gbtsentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
#fetching dataset from kaggle
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 91% 74.0M/80.9M [00:00<00:00, 772MB/s]
100% 80.9M/80.9M [00:00<00:00, 749MB/s]


In [3]:
#extracting zip file

from zipfile import ZipFile
dataset='/content/sentiment140.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print("dataset has been extracted")

dataset has been extracted


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, split, when, size
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
spark = SparkSession.builder.appName("TwitterSentimentImproved").getOrCreate()

In [6]:
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = spark.read.csv("/content/training.1600000.processed.noemoticon.csv",
                    header=False, inferSchema=True, encoding="ISO-8859-1").toDF(*column_names)


In [7]:
# 3. Binary label: convert 4 to 1 (positive), 0 stays 0 (negative)
df = df.withColumn("label", when(col("target") == 4, 1).otherwise(0))

In [8]:
df = df.withColumn("clean_text", regexp_replace(lower(col("text")), r"[^a-z\s]", ""))
df = df.withColumn("tokens", split(col("clean_text"), "\s+"))


In [9]:
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
df = remover.transform(df)

In [10]:
df = df.withColumn("filtered_tokens", col("filtered_tokens"))
df = df.filter(size(col("filtered_tokens")) > 0)

In [11]:
hashingTF = HashingTF(inputCol="filtered_tokens", outputCol="rawFeatures", numFeatures=10000)
tf_df = hashingTF.transform(df)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(tf_df)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
train_data, test_data = tfidf_df.randomSplit([0.8, 0.2], seed=42)



In [None]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
rf_model = rf.fit(train_data)

In [None]:
predictions = rf_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Random Forest Accuracy (TF-IDF): {accuracy:.4f}")

Random Forest Accuracy (TF-IDF): 0.6960


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, split
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [13]:
spark = SparkSession.builder.appName("TwitterSentimentImproved").getOrCreate()

In [14]:
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = spark.read.csv("/content/training.1600000.processed.noemoticon.csv",
                    header=False, inferSchema=True, encoding="ISO-8859-1")
df = df.toDF(*column_names)

In [15]:
# Keep only 0 (negative) and 4 (positive) sentiment tweets
df = df.filter((col("target") == 0) | (col("target") == 4))

# Map 4 → 1 to make it binary classification
from pyspark.sql.functions import when

df = df.withColumn("target", when(col("target") == 4, 1).otherwise(0))



In [16]:
df = df.withColumn("text", lower(col("text")))
df = df.withColumn("text", regexp_replace(col("text"), r"http\S+|www\S+|@\w+|#\w+", ""))
df = df.withColumn("text", regexp_replace(col("text"), r"[^a-z\s]", ""))
df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))
df = df.withColumn("tokens", split(col("text"), " "))

In [17]:
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

In [18]:
hashingTF = HashingTF(inputCol="filtered_tokens", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features")


In [19]:
gbt = GBTClassifier(featuresCol="features", labelCol="target", maxIter=50)

In [20]:
pipeline = Pipeline(stages=[remover, hashingTF, idf, gbt])

In [21]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)


In [None]:
model = pipeline.fit(train_data)

In [None]:
predictions = model.transform(test_data)
predictions.select("prediction", "target").show(10)

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Improved Accuracy = {accuracy:.4f}")