<a href="https://colab.research.google.com/github/EmilKJohn99/hello/blob/emilwork/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd


In [33]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
import re


In [19]:
!pip install kaggle



In [20]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [21]:
#fetching dataset from kaggle
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [22]:
#extracting zip file

from zipfile import ZipFile
dataset='/content/sentiment140.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print("dataset has been extracted")

dataset has been extracted


In [60]:
from pyspark.sql import SparkSession

# 1. Create Spark session (only once)
spark = SparkSession.builder.appName("TwitterSentiment").getOrCreate()

# 2. Define column names
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

# 3. Read CSV into PySpark DataFrame
df = spark.read.csv("/content/training.1600000.processed.noemoticon.csv",
                    header=False,       # because the CSV has no header
                    inferSchema=True,   # infer data types
                    encoding="ISO-8859-1")  # for special characters

# 4. Rename columns
df = df.toDF(*column_names)

# 5. Show first few rows
df.show(5)


+------+----------+--------------------+--------+---------------+--------------------+
|target|        id|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+------+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [61]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Word2VecExample") \
    .getOrCreate()


In [63]:
from pyspark.sql.functions import col, lower, regexp_replace, split

# 1. Convert text to lowercase and remove non-alphabetic characters
df = df.withColumn("clean_text", regexp_replace(lower(col("text")), r"[^a-z\s]", ""))

# 2. Tokenize (split into words)
df = df.withColumn("tokens", split(col("clean_text"), " "))


In [65]:
spark_df = df.select("tokens")


In [67]:
spark_df.select("tokens").show(10, truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------+
|tokens                                                                                                                          |
+--------------------------------------------------------------------------------------------------------------------------------+
|[switchfoot, httptwitpiccomyzl, , awww, thats, a, bummer, , you, shoulda, got, david, carr, of, third, day, to, do, it, d]      |
|[is, upset, that, he, cant, update, his, facebook, by, texting, it, and, might, cry, as, a, result, , school, today, also, blah]|
|[kenichan, i, dived, many, times, for, the, ball, managed, to, save, , , the, rest, go, out, of, bounds]                        |
|[my, whole, body, feels, itchy, and, like, its, on, fire, ]                                                                     |
|[nationwideclass, no, its, not, behaving, at, all, im, mad, why, am, i, here, beca

In [68]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
filtered_df = remover.transform(spark_df)


In [69]:
filtered_df.show()

+--------------------+--------------------+
|              tokens|     filtered_tokens|
+--------------------+--------------------+
|[switchfoot, http...|[switchfoot, http...|
|[is, upset, that,...|[upset, cant, upd...|
|[kenichan, i, div...|[kenichan, dived,...|
|[my, whole, body,...|[whole, body, fee...|
|[nationwideclass,...|[nationwideclass,...|
|[kwesidei, not, t...|[kwesidei, whole,...|
|    [need, a, hug, ]|       [need, hug, ]|
|[loltrish, hey, ,...|[loltrish, hey, ,...|
|[tatianak, nope, ...|[tatianak, nope, ...|
|[twittera, que, m...|[twittera, que, m...|
|[spring, break, i...|[spring, break, p...|
|[i, just, repierc...| [repierced, ears, ]|
|[caregiving, i, c...|[caregiving, coul...|
|[octolinz, it, it...|[octolinz, counts...|
|[smarrison, i, wo...|[smarrison, would...|
|[iamjazzyfizzle, ...|[iamjazzyfizzle, ...|
|[hollis, death, s...|[hollis, death, s...|
|[about, to, file,...|     [file, taxes, ]|
|[lettya, ahh, ive...|[lettya, ahh, ive...|
|[fakerpattypattz,...|[fakerpatt

In [48]:
filtered_df = filtered_df.drop("tokens")


In [55]:
df.head()


Unnamed: 0,target,id,date,flag,user,text,tokens,filtered_tokens
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,switchfoot httptwitpiccomyzl awww thats a bum...,"[switchfoot, httptwitpiccomyzl, awww, thats, a...",DataFrame[filtered_tokens: array<string>]
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he cant update his facebook by t...,"[is, upset, that, he, cant, update, his, faceb...",DataFrame[filtered_tokens: array<string>]
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan i dived many times for the ball manag...,"[kenichan, i, dived, many, times, for, the, ba...",DataFrame[filtered_tokens: array<string>]
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...",DataFrame[filtered_tokens: array<string>]
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass no its not behaving at all im ...,"[nationwideclass, no, its, not, behaving, at, ...",DataFrame[filtered_tokens: array<string>]


In [57]:
print(df.columns)


Index(['target', 'id', 'date', 'flag', 'user', 'text', 'tokens',
       'filtered_tokens'],
      dtype='object')


In [58]:
df = df.drop("filtered_tokens", axis=1)


In [70]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_tokens", outputCol="features")
model = word2Vec.fit(filtered_df)


In [71]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(
    vectorSize=100,       # size of word vectors
    windowSize=5,         # context window
    minCount=5,           # ignore words with <5 occurrences
    inputCol="filtered_tokens",  # or "tokens" if you skipped stopwords
    outputCol="features"
)

model = word2Vec.fit(filtered_df)


In [72]:
result = model.transform(filtered_df)


In [99]:
from pyspark.sql.functions import monotonically_increasing_id

# Add index columns
df_with_index = df.withColumn("idx", monotonically_increasing_id())
result_with_index = result.withColumn("idx", monotonically_increasing_id())

# Join on the index
final_df = result_with_index.join(df_with_index.select("idx", "target"), on="idx").drop("idx")


In [100]:
train_data, test_data = final_df.randomSplit([0.8, 0.2], seed=42)



In [101]:
result.show()

+--------------------+--------------------+--------------------+
|              tokens|     filtered_tokens|            features|
+--------------------+--------------------+--------------------+
|[switchfoot, http...|[switchfoot, http...|[-0.0537012548052...|
|[is, upset, that,...|[upset, cant, upd...|[-0.0713070921886...|
|[kenichan, i, div...|[kenichan, dived,...|[0.00996953621506...|
|[my, whole, body,...|[whole, body, fee...|[0.04063687446926...|
|[nationwideclass,...|[nationwideclass,...|[0.00735034900052...|
|[kwesidei, not, t...|[kwesidei, whole,...|[-0.0329757235012...|
|    [need, a, hug, ]|       [need, hug, ]|[-0.0544717479497...|
|[loltrish, hey, ,...|[loltrish, hey, ,...|[0.04788446867544...|
|[tatianak, nope, ...|[tatianak, nope, ...|[-0.0052693709731...|
|[twittera, que, m...|[twittera, que, m...|[-0.0633812626823...|
|[spring, break, i...|[spring, break, p...|[0.06802074611186...|
|[i, just, repierc...| [repierced, ears, ]|[-0.0128068284442...|
|[caregiving, i, c...|[ca

In [102]:
final_df.select("features", "target").show()


+--------------------+------+
|            features|target|
+--------------------+------+
|[-0.0537012548052...|     0|
|[-0.0286609143950...|     0|
|[0.11045893765985...|     0|
|[-0.0600941821134...|     0|
|[0.02250165287405...|     0|
|[-0.0544717479497...|     0|
|[-0.1210425070642...|     0|
|[-0.0280522913672...|     0|
|[0.05665780078958...|     0|
|[-0.0034547784365...|     0|
|[0.07804573373869...|     0|
|[0.02062398828566...|     0|
|[0.01291885289053...|     0|
|[-0.0228574319432...|     0|
|[0.03906210251152...|     0|
|[0.00917887756306...|     0|
|[0.01761707228918...|     0|
|[0.02492280041968...|     0|
|[0.00866161107551...|     0|
|[-0.0127031747251...|     0|
+--------------------+------+
only showing top 20 rows



In [94]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="target")
model = lr.fit(train_data)

In [103]:
predictions = classifier.transform(test_data)
predictions.select("prediction", "target").show(10)


+----------+------+
|prediction|target|
+----------+------+
|       4.0|     0|
|       4.0|     4|
|       4.0|     0|
|       0.0|     0|
|       4.0|     0|
|       4.0|     4|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       4.0|     4|
+----------+------+
only showing top 10 rows



In [104]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="target",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Train Accuracy = {accuracy:.4f}")

Test Accuracy = 0.7125


Test Accuracy: 0.7125


In [98]:
test_data.columns


['tokens', 'filtered_tokens', 'target']

In [92]:
test_data = test_data.drop("features")


