In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Start the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("pipeline_twitter4")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/02 15:05:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the raw training CSV: the first line is read as data (no header row)
# and column types are inferred by Spark.
# NOTE(review): the absolute local path is hard-coded — consider a configurable data directory.
df_twitter = spark.read.csv(
    "/home/ayoubbakkali/Desktop/Sentiment_Analysis/twitter_training.csv",
    header=False,
    inferSchema=True,
)

In [4]:
# The CSV is read without a header, so assign readable names to its columns by position
columns = [
    "Tweet ID",
    "Entity",
    "Sentiment",
    "Tweet content",
]
df_twitter = df_twitter.toDF(*columns)

In [5]:
# Remove the "Tweet ID" column; no later step in this notebook reads it
df_twitter = df_twitter.drop("Tweet ID")

In [6]:
# Discard rows whose tweet text is null before any text processing
df_twitter = df_twitter.na.drop(subset=["Tweet content"])

In [7]:
# Import required modules
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

# Define the clean_and_lowercase function
def clean_and_lowercase(text):
    """Lowercase a tweet and strip punctuation and other symbols.

    Args:
        text: The raw tweet text, or None.

    Returns:
        The lowercased text with every character that is neither a word
        character nor whitespace removed, or None when ``text`` is None.
    """
    # Spark hands SQL NULLs to Python UDFs as None; propagate them instead of
    # raising AttributeError on ``None.lower()``.
    if text is None:
        return None
    # Lowercase first, then drop everything except word characters and whitespace
    return re.sub(r'[^\w\s]', '', text.lower())

# Wrap the Python cleaner as a Spark UDF that returns a string column
clean_and_lowercase_udf = udf(clean_and_lowercase, returnType=StringType())

# Derive "cleaned_tweet" from the raw "Tweet content" column
df_twitter = df_twitter.withColumn(
    "cleaned_tweet",
    clean_and_lowercase_udf(col("Tweet content")),
)

In [8]:
# Show the DataFrame repr (column names and types) to confirm "cleaned_tweet" was added
df_twitter

DataFrame[Entity: string, Sentiment: string, Tweet content: string, cleaned_tweet: string]

In [9]:
# Create the preprocessing stages
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Encode the "Sentiment" string as a numeric "label" column
indexer = StringIndexer(
    inputCol="Sentiment",
    outputCol="label",
)

# Split the cleaned text into a token array
tokenizer = Tokenizer(
    inputCol="cleaned_tweet",
    outputCol="tokens",
)

# Drop stop words from the token array
stop_words_remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered_tweet",
)

# Turn the remaining tokens into term-count vectors
cv = CountVectorizer(
    inputCol="filtered_tweet",
    outputCol="raw_features",
)

# Re-weight the term counts by inverse document frequency to produce "features"
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)



In [10]:
from pyspark.ml import Pipeline
# Chain the preprocessing stages in execution order
data_preprocessing_pipeline = Pipeline(
    stages=[
        indexer,             # Sentiment -> label
        tokenizer,           # cleaned_tweet -> tokens
        stop_words_remover,  # tokens -> filtered_tweet
        cv,                  # filtered_tweet -> raw_features
        idf,                 # raw_features -> features
    ]
)

In [11]:
# Fit every preprocessing stage (label index, vocabulary, IDF weights) on df_twitter.
# NOTE(review): this fit sees the full dataset, and the train/test split only happens
# afterwards on the transformed frame, so the vocabulary/IDF statistics include test
# rows — consider splitting first and fitting on the training split only.
preprocessing_model = data_preprocessing_pipeline.fit(df_twitter)

                                                                                

In [12]:
# Apply the fitted stages, adding the label, tokens, filtered_tweet, raw_features and features columns
df_transformed = preprocessing_model.transform(df_twitter)

In [13]:
# Location where the fitted preprocessing pipeline is written
model_path = "preprocessing_pipeline1"
# Persist the fitted preprocessing model. A plain `save()` raises if the path
# already exists, which breaks re-running the notebook, so overwrite explicitly.
preprocessing_model.write().overwrite().save(model_path)

In [14]:
# 80/20 train/test partition of the transformed data, with a fixed seed for reproducibility
train_data, test_data = df_transformed.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [15]:
# Inspect the training split's schema, including the columns added by the preprocessing pipeline
train_data

DataFrame[Entity: string, Sentiment: string, Tweet content: string, cleaned_tweet: string, label: double, tokens: array<string>, filtered_tweet: array<string>, raw_features: vector, features: vector]

In [16]:
from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Linear SVM base learner (maxIter=10, regParam=0.1) reading the pipeline's output columns
svm = LinearSVC(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.1,
)

# LinearSVC is a binary classifier, so wrap it in one-vs-rest to handle every sentiment class
ovr = OneVsRest(classifier=svm)

# Fit the one-vs-rest ensemble on the training split
ovr_model = ovr.fit(train_data)

# Score the held-out test split with the fitted one-vs-rest model
predictions = ovr_model.transform(test_data)

# Accuracy: compare the "prediction" column against the indexed "label" column
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy SVM:", accuracy)

24/05/02 15:07:47 WARN DAGScheduler: Broadcasting large task binary with size 1216.1 KiB
24/05/02 15:07:48 WARN DAGScheduler: Broadcasting large task binary with size 1216.8 KiB
24/05/02 15:07:50 WARN DAGScheduler: Broadcasting large task binary with size 1251.5 KiB
24/05/02 15:07:50 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/05/02 15:07:50 WARN DAGScheduler: Broadcasting large task binary with size 1252.2 KiB
24/05/02 15:07:51 WARN DAGScheduler: Broadcasting large task binary with size 1252.2 KiB
24/05/02 15:07:51 WARN DAGScheduler: Broadcasting large task binary with size 1252.2 KiB
24/05/02 15:07:51 WARN DAGScheduler: Broadcasting large task binary with size 1252.2 KiB
24/05/02 15:07:52 WARN DAGScheduler: Broadcasting large task binary with size 1252.2 KiB
24/05/02 15:07:52 WARN DAGScheduler: Broadcasting large task binary with size 1252.2 KiB
24/05/02 15:07:52 WARN DAGScheduler: Broadcasting large task binary with size 1252.2 KiB


Accuracy SVM: 0.8504717626145221


                                                                                