In [None]:
import time

# Wall-clock start of the notebook run; the final cell subtracts this from
# time.time() to report the total runtime.
start_time = time.time()

In [None]:
!pip install --upgrade pyspark==3.4.1 spark-nlp==5.4.0


Collecting pyspark==3.4.1
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spark-nlp==5.4.0
  Downloading spark_nlp-5.4.0-py2.py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting py4j==0.10.9.7 (from pyspark==3.4.1)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading spark_nlp-5.4.0-py2.py3-none-any.whl (579 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.2/579.2 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pys

In [None]:
# ================================
# Spark NLP Initialization
# ================================
# sparknlp.start() returns the session that is bound to `spark`; the
# data-loading cell reads the input CSV through it.
try:
    import sparknlp
    spark = sparknlp.start()
    print("✅ Spark NLP started successfully")
except Exception as e:
    # Chain explicitly so the original error is reported as the direct cause.
    raise RuntimeError(f"❌ Spark NLP start failed: {e}") from e


✅ Spark NLP started successfully


In [None]:
# ================================
# Imports
# ================================
# Spark NLP stages, the Spark ML Pipeline container and the column helpers
# used by the cells below.
try:
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import (
        Tokenizer,
        UniversalSentenceEncoder,
        SentimentDLModel
    )
    from pyspark.ml import Pipeline
    from pyspark.sql.functions import col, current_timestamp
    print("✅ Imports successful")
except Exception as e:
    # Chain explicitly so the underlying import failure stays attached as the cause.
    raise ImportError(f"❌ Import error: {e}") from e


✅ Imports successful


In [None]:
# Load the input CSV: the first row is treated as the header and column types
# are inferred from the data.
# NOTE(review): the inferred schema printed below reports `id` and
# `conversation_id` as double; if these are large integer identifiers they may
# lose precision -- confirm, and consider an explicit schema.
df = spark.read.csv(
    "/content/sliver_layer__1_ (1).csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Print the inferred schema to check column names and types before validation.
df.printSchema()


root
 |-- id: double (nullable = true)
 |-- text: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- username: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- language: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- like_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- impression_count: integer (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- is_reply: integer (nullable = true)
 |-- in_reply_to_user_id: integer (nullable = true)
 |-- conversation_id: double (nullable = true)
 |-- user_followers_count: integer (nullable = true)
 |-- user_following_count: integer (nullable = true)
 |-- user_verified: boolean (nullable = true)
 |-- user_location: string (nullable = true)
 |-- possibly_sensitive: boolean (nulla

In [None]:
from pyspark.sql.functions import col

# Step 1: Ensure correct column name for ML
# If the file only provides "clean_text", rename it to "cleaned_text" so the
# validation below finds the column it expects.
# Note: the DocumentAssembler defined further down reads "clean_text", and a
# later cell renames this column back before the pipeline is fitted.
if "cleaned_text" not in df.columns and "clean_text" in df.columns:
    df = df.withColumnRenamed("clean_text", "cleaned_text")

# Step 2: Define required column
# Name of the text column that validate_input_data() checks for.
REQUIRED_COL = "cleaned_text"

# Step 3: Validation function
def validate_input_data(df, required_col=None):
    """Validate the input DataFrame and drop rows whose text column is null.

    Args:
        df: Spark DataFrame to validate.
        required_col: Name of the text column that must be present. Defaults
            to the module-level ``REQUIRED_COL`` when omitted.

    Returns:
        ``df`` restricted to the rows where ``required_col`` is not null.

    Raises:
        ValueError: If the column is missing, the DataFrame has no rows, or
            every row has a null value in the column.
    """
    if required_col is None:
        required_col = REQUIRED_COL

    if required_col not in df.columns:
        raise ValueError(f"❌ Missing required column: {required_col}")

    # head(1) only needs to find a single row, so the emptiness check does not
    # require counting the whole DataFrame.
    if not df.head(1):
        raise ValueError("❌ Input DataFrame is empty")

    null_count = df.filter(col(required_col).isNull()).count()
    non_null_df = df.filter(col(required_col).isNotNull())

    if null_count > 0:
        print(f"⚠️ Warning: {null_count} null rows removed")
        # Nulls were present: make sure dropping them leaves something to
        # score instead of silently handing an empty DataFrame downstream.
        if not non_null_df.head(1):
            raise ValueError(
                f"❌ Column {required_col} contains only null values"
            )

    return non_null_df

# Step 4: Apply validation
# Rebinds `df` to the validated DataFrame (rows with a null text column removed).
df = validate_input_data(df)

# Step 5: Quick verification
# Show the first five text values untruncated as a visual sanity check.
df.select("cleaned_text").show(5, truncate=False)


+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|cleaned_text                                                                                                                                           |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|agent every development say quality throughout beautiful databreach                                                                                    |
|night respond red information last everything cve blakeerik                                                                                            |
|here grow gas enough analysis least by infosec cybersecurity mfa                                                                                       |
|product significant world talk term herself player half have decide environ

In [None]:
# ================================
# NLP Components
# ================================
# Build the individual Spark NLP stages; the next cell assembles them into a
# Pipeline.
try:
    # Reads the raw text column and emits it as the "document" column.
    # Note: the input column is "clean_text", so the DataFrame must carry that
    # column name by the time the pipeline is fitted.
    document_assembler = (
        DocumentAssembler()
        .setInputCol("clean_text")
        .setOutputCol("document")
    )

    # NOTE(review): no stage defined in this cell lists "token" as an input
    # column -- confirm whether this stage is actually needed.
    tokenizer = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    # Pretrained encoder; reads "document" and writes "embeddings".
    embeddings = (
        UniversalSentenceEncoder.pretrained("tfhub_use", "en")
        .setInputCols(["document"])
        .setOutputCol("embeddings")
    )

    # Pretrained sentiment classifier; reads "embeddings" and writes "sentiment".
    sentiment_model = (
        SentimentDLModel.pretrained("sentimentdl_use_twitter", "en")
        .setInputCols(["embeddings"])
        .setOutputCol("sentiment")
    )

    print("✅ NLP components initialized")

except Exception as e:
    # Chain explicitly so download/initialisation errors keep their original cause.
    raise RuntimeError(f"❌ NLP component error: {e}") from e


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]
✅ NLP components initialized


In [None]:
# ================================
# Build Pipeline
# ================================
try:
    # Chain only the stages whose output feeds the sentiment label:
    # document -> embeddings -> sentiment.
    # The tokenizer is left out because nothing consumes its "token" column:
    # the encoder reads "document" and the classifier reads "embeddings", so
    # running it would only add work and an unused column.
    pipeline = Pipeline(stages=[
        document_assembler,
        embeddings,
        sentiment_model
    ])

    print("✅ Pipeline built successfully")

except Exception as e:
    # Chain explicitly so the original error is reported as the direct cause.
    raise RuntimeError(f"❌ Pipeline creation failed: {e}") from e


✅ Pipeline built successfully


In [None]:
# Rename the validated text column back to "clean_text", the input column the
# DocumentAssembler was configured with.
df = df.withColumnRenamed("cleaned_text", "clean_text")


In [None]:
# ================================
# Run Prediction
# ================================
try:
    # fit() turns the Pipeline into a fitted model; transform() applies it to
    # the same DataFrame and returns the annotated result.
    # NOTE(review): Spark transformations are lazily evaluated, so the heavy
    # inference presumably runs when later cells call show()/write/count()
    # rather than here -- confirm before trusting the message below as a
    # completion signal.
    model = pipeline.fit(df)
    prediction_df = model.transform(df)

    print("✅ Sentiment prediction completed")

except Exception as e:
    # Chain explicitly so the underlying Spark error stays attached as the cause.
    raise RuntimeError(f"❌ Prediction failed: {e}") from e


✅ Sentiment prediction completed


In [None]:
# ================================
# Extract Sentiment Label
# ================================
try:
    final_df = (
        prediction_df
        # Keep the `result` field of the first element of the "sentiment"
        # column as a plain label column.
        .withColumn("sentiment_label", col("sentiment")[0]["result"])
        # Record when the prediction was produced.
        .withColumn("_prediction_timestamp", current_timestamp())
    )

    # Preview ten text/label pairs untruncated.
    final_df.select(
        "clean_text",
        "sentiment_label"
    ).show(10, truncate=False)

    print("✅ Sentiment label extracted")

except Exception as e:
    # Chain explicitly so the underlying Spark error stays attached as the cause.
    raise RuntimeError(f"❌ Sentiment extraction failed: {e}") from e


+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|clean_text                                                                                                                                             |sentiment_label|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|agent every development say quality throughout beautiful databreach                                                                                    |positive       |
|night respond red information last everything cve blakeerik                                                                                            |positive       |
|here grow gas enough analysis least by infosec cybersecurity mfa                                                                                     

In [None]:
# Print the schema of the prediction result to confirm the columns added by
# the pipeline and the label extraction.
final_df.printSchema()

root
 |-- id: double (nullable = true)
 |-- text: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- username: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- language: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- like_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- impression_count: integer (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- is_reply: integer (nullable = true)
 |-- in_reply_to_user_id: integer (nullable = true)
 |-- conversation_id: double (nullable = true)
 |-- user_followers_count: integer (nullable = true)
 |-- user_following_count: integer (nullable = true)
 |-- user_verified: boolean (nullable = true)
 |-- user_location: string (nullable = true)
 |-- possibly_sensitive: boolean (nulla

In [None]:
# ================================
# Write CSV safely (Colab)
# ================================
try:
    # Select the two output columns first so only they are shuffled.
    # repartition(1) still produces a single output partition, but unlike
    # coalesce(1) it inserts a shuffle, so the upstream inference is not
    # collapsed into one task. Row order in the file is not guaranteed.
    final_df \
        .select("clean_text", "sentiment_label") \
        .repartition(1) \
        .write \
        .mode("overwrite") \
        .option("header", "true") \
        .csv("/content/final_ml_predictions")

    print("✅ CSV written successfully")

except Exception as e:
    # Chain explicitly so the underlying Spark/IO error stays attached as the cause.
    raise RuntimeError(f"❌ CSV write failed: {e}") from e


✅ CSV written successfully


In [None]:
# Number of rows in the prediction result.
# NOTE(review): final_df is not cached, so this action may recompute the
# pipeline -- confirm whether the extra pass is acceptable.
final_df.count()

503456

In [None]:
# Stop the wall-clock timer started in the first cell and report the elapsed
# time, rounded to two decimals.
end_time = time.time()
elapsed_seconds = end_time - start_time

print(f"Total runtime: {round(elapsed_seconds, 2)} seconds")

Total runtime: 1490.08 seconds
