In [15]:
import numpy as np              #For handling arrays
import pandas as pd             # For handling data
import os
import shutil

import matplotlib.pyplot as plt

%matplotlib inline

#from tensorflow.keras.preprocessing.image import ImageDataGenerator
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
#from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
#from tensorflow.keras.utils import plot_model
#from sklearn.utils.class_weight import compute_class_weight

import warnings
warnings.filterwarnings('ignore')

In [16]:
# Path to data set.
# Defaults to the original local file; set the PROJECT_TWEETS_CSV environment
# variable to point the notebook at another copy without editing the code.
csv_file = os.environ.get(
    "PROJECT_TWEETS_CSV",
    "file:///home/hduser/Downloads/work2/ProjectTweets.csv",
)

# Data Understanding

In [17]:
# Spark SQL entry point
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one, otherwise start one named "SparkSQL"
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

# The CSV has no header row, so header is "false" and the column names are
# assigned explicitly with toDF; column types are inferred by Spark.
twitter_columns = ['id', 'seq', 'date', 'query', 'user', 'tweet']
csv_reader = spark.read.format("csv").options(inferSchema="true", header="false")
dfTwitter = csv_reader.load(csv_file).toDF(*twitter_columns)

# Register the DataFrame as the temporary view tblTempTwitter for spark.sql queries
dfTwitter.createOrReplaceTempView("tblTempTwitter")


                                                                                

In [18]:
# Preview the first 10 rows of the temporary view
preview_sql = """SELECT * FROM tblTempTwitter"""
spark.sql(preview_sql).show(10)

+---+----------+--------------------+--------+---------------+--------------------+
| id|       seq|                date|   query|           user|               tweet|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|  5|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|  6|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|  7|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  8|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nop

In [19]:
# List up to 10 rows whose query column is anything other than 'NO_QUERY'
non_default_query_sql = """SELECT * FROM tblTempTwitter where query != 'NO_QUERY'"""
spark.sql(non_default_query_sql).show(10)

[Stage 5:>                                                          (0 + 1) / 1]

+---+---+----+-----+----+-----+
| id|seq|date|query|user|tweet|
+---+---+----+-----+----+-----+
+---+---+----+-----+----+-----+



                                                                                

In [20]:
# Count rows per user and show the 20 users with the highest totals
tweets_per_user_sql = """SELECT user, COUNT(user) as total FROM tblTempTwitter GROUP BY user ORDER BY total desc;"""
spark.sql(tweets_per_user_sql).show(20)



[Stage 8:>                                                          (0 + 6) / 6]

+---------------+-----+
|           user|total|
+---------------+-----+
|       lost_dog|  549|
|        webwoke|  345|
|       tweetpet|  310|
|SallytheShizzle|  281|
|    VioletsCRUK|  279|
|    mcraddictal|  276|
|       tsarnick|  248|
|    what_bugs_u|  246|
|    Karen230683|  238|
|      DarkPiano|  236|
|   SongoftheOss|  227|
|      Jayme1988|  225|
|         keza34|  219|
| ramdomthoughts|  216|
|      shanajaca|  213|
|         wowlew|  212|
|     nuttychris|  211|
|   TraceyHewins|  211|
|   thisgoeshere|  207|
|     Spidersamm|  205|
+---------------+-----+
only showing top 20 rows



                                                                                

Looking for null or blank date values

In [21]:
# Show up to 20 rows whose date is NULL or an empty string
missing_date_sql = """SELECT user, tweet FROM tblTempTwitter where date is null or date =='';"""
spark.sql(missing_date_sql).show(20)


[Stage 11:>                                                         (0 + 1) / 1]

+----+-----+
|user|tweet|
+----+-----+
+----+-----+



                                                                                

# Data Preparation

### SPARK HIVE


Creating a Database in Hive Metastore 

In [22]:
# Make sure the dbTwitter database exists in Hive (IF NOT EXISTS keeps re-runs harmless)
create_database_sql = "CREATE DATABASE IF NOT EXISTS dbTwitter"
spark.sql(create_database_sql)



2023-10-18 09:50:42,644 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
2023-10-18 09:50:42,646 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
2023-10-18 09:50:46,798 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
2023-10-18 09:50:46,798 WARN metastore.ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore hduser@127.0.1.1
2023-10-18 09:50:47,225 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
2023-10-18 09:50:47,242 ERROR metastore.RetryingHMSHandler: AlreadyExistsException(message:Database dbtwitter already exists)
	at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_database(HiveMetaStore.java:925)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeM

DataFrame[]

Use `spark.sql()` with a `CREATE TABLE` statement to create the Hive table that will receive the rows of the Spark temporary view tblTempTwitter.

In [23]:
# Create the Hive table tblTwitter in the dbTwitter database.
# seq holds whole numbers in the source data (see the preview of tblTempTwitter), so it is
# declared BigInt; declaring it Double made the values display in scientific notation
# (e.g. 2.201337104E9).
# NOTE: IF NOT EXISTS leaves an already-created table untouched, so an existing table
# must be dropped first for the new column type to take effect.
spark.sql("CREATE TABLE IF NOT EXISTS dbTwitter.tblTwitter (id Int, seq BigInt, date String, query String, user String, tweet String)")



2023-10-18 09:50:55,618 WARN analysis.ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
2023-10-18 09:50:56,236 WARN session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
2023-10-18 09:50:56,471 WARN conf.HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
2023-10-18 09:50:56,471 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
2023-10-18 09:50:56,472 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
2023-10-18 09:50:56,529 ERROR metastore.RetryingHMSHandler: AlreadyExistsException(message:Table tbltwitter already exists)
	at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_table_core(HiveMetaStore.java:1416)
	at org.apach

DataFrame[]

Inserting data from the spark temporary view tblTempTwitter into the Hive table tblTwitter:

In [25]:
# Load the Hive table tblTwitter from the Spark temp view tblTempTwitter.
# INSERT OVERWRITE replaces the table contents, so re-running this cell does not
# append another full copy of the data the way INSERT INTO would.
spark.sql("INSERT OVERWRITE TABLE dbTwitter.tblTwitter SELECT * FROM tblTempTwitter")



                                                                                

DataFrame[]

In [26]:
# Read back the first 10 rows stored in the Hive table
hive_preview_sql = "SELECT * FROM dbTwitter.tblTwitter"
spark.sql(hive_preview_sql).show(10)

+------+-------------+--------------------+--------+---------------+--------------------+
|    id|          seq|                date|   query|           user|               tweet|
+------+-------------+--------------------+--------+---------------+--------------------+
|545133|2.201337104E9|Tue Jun 16 20:08:...|NO_QUERY|      alt_ducky|@miss_clariss oh ...|
|545134|2.201337108E9|Tue Jun 16 20:08:...|NO_QUERY|     CourtneyVR|Failed my WOF. Wi...|
|545135|2.201337287E9|Tue Jun 16 20:08:...|NO_QUERY|    melissaholt|Watching the firs...|
|545136|2.201337425E9|Tue Jun 16 20:08:...|NO_QUERY|       itznesha|my computer is in...|
|545137|2.201337512E9|Tue Jun 16 20:08:...|NO_QUERY|    lovinmyboys|Worked out my upp...|
|545138|2.201337757E9|Tue Jun 16 20:08:...|NO_QUERY|     mikerbrant|OMG I got my new ...|
|545139|2.201338077E9|Tue Jun 16 20:08:...|NO_QUERY|         daulex|my back has flare...|
|545140|2.201338113E9|Tue Jun 16 20:08:...|NO_QUERY|    CaliHeather|I am starting to ...|
|545141|2.

### MySQL

In [27]:
# Imports
from pyspark.sql import SparkSession

# Session handle intended for MySQL access, with the MySQL connector jar listed in spark.jars.
# NOTE(review): a SparkSession named `spark` is already created earlier in this notebook and
# getOrCreate() returns the active session when one exists — confirm that the app name and
# spark.jars requested here actually take effect.
mysql_session_builder = SparkSession.builder.appName("SparkMySQL")
mysql_session_builder = mysql_session_builder.config("spark.jars", "mysql-connector-java-8.1.0.jar")
sparkMySQL = mysql_session_builder.getOrCreate()

#spark = SparkSession.builder \
#  .appName("MyApp") \
#  .config("spark.jars", "mysql-connector-java-8.0.27.jar") \
#  .config("spark.driver.extraClassPath", "mysql-connector-java-8.0.27.jar") \
#  .getOrCreate()


In [28]:
#pip install ipython-sql

In [29]:
#pip install mysqlclient

In [30]:
#pip install mysql-connector-python

In [31]:
#pip install pymysql

In [32]:
import mysql.connector


In [33]:
#PLEASE, RUN THE COMMANDS BELOW ON TERMINAL TO CREATE AND GRANT PERMISSIONS TO user1:

#mysql -u root -p

#CREATE USER 'user1'@'%' IDENTIFIED BY 'Pass@word1';
#GRANT ALL PRIVILEGES ON *.* TO 'user1'@'%';
#FLUSH PRIVILEGES;

In [34]:

# Open a mysql.connector connection as user1, then create (if needed) and select dbTwitter
mysql_credentials = {"user": "user1", "password": "Pass@word1"}
db_connection = mysql.connector.connect(**mysql_credentials)
db_cursor = db_connection.cursor()

for setup_statement in ("CREATE DATABASE IF NOT EXISTS dbTwitter;", "USE dbTwitter;"):
    db_cursor.execute(setup_statement)



In [35]:
# Create the MySQL table that mirrors the six columns of the tweets DataFrame.
# The adjacent string literals below concatenate into one single-line statement.
create_table_sql = (
    "CREATE TABLE IF NOT EXISTS dbTwitter.tblTwitter ("
    "id NUMERIC, seq NUMERIC, date VARCHAR(50), query VARCHAR(50), "
    "user VARCHAR(50), tweet TEXT)"
)
db_cursor.execute(create_table_sql)


In [39]:
# Convert the Spark DataFrame into a pandas DataFrame; toPandas() brings every
# row into this Python process, so the whole dataset must fit in local memory.
dfPandasTwitter = dfTwitter.toPandas()

                                                                                

In [52]:
# Import pymysql module
import pymysql

In [53]:
# Connect to the dbTwitter database on the local MySQL server with PyMySQL.
# `database` is the current keyword name; `db` is only kept by PyMySQL as a deprecated alias.
connection = pymysql.connect(host = 'localhost',
                             user = 'user1',
                             password = 'Pass@word1',
                             database = 'dbTwitter')

In [54]:
# Cursor on the PyMySQL connection, used to execute statements against dbTwitter
cursor = connection.cursor()

In [None]:
# Column list for the INSERT statement, taken from the DataFrame so both stay in sync
cols = ",".join(str(name) for name in dfPandasTwitter.columns)

# The statement is the same for every row, so build it once:
# one %s placeholder per DataFrame column.
placeholders = ",".join(["%s"] * len(dfPandasTwitter.columns))
sql = "INSERT INTO tblTwitter (" + cols + ") VALUES (" + placeholders + ")"

# Send the rows in batches with executemany instead of one execute + commit per
# row; the per-row round trip and commit dominate the run time on a large frame.
INSERT_BATCH_SIZE = 10000

try:
    for start in range(0, len(dfPandasTwitter), INSERT_BATCH_SIZE):
        batch = dfPandasTwitter.iloc[start:start + INSERT_BATCH_SIZE]
        # itertuples(index=False, name=None) yields one plain tuple of values per row
        cursor.executemany(sql, list(batch.itertuples(index=False, name=None)))

    # the connection is not autocommitted by default, so we must commit to save our changes
    connection.commit()
except Exception:
    # Leave the table as it was if any batch fails, then surface the error
    connection.rollback()
    raise

In [None]:
# Count the rows stored in the MySQL table and wrap the fetched result in a DataFrame
row_count_sql = 'SELECT count(*) as total FROM dbTwitter.tblTwitter'
db_cursor.execute(row_count_sql)
table_rows = db_cursor.fetchall()
df = pd.DataFrame(table_rows)

In [None]:
pip install vaderSentiment

The VADER compound score is mapped to a sentiment label using these thresholds:

 - positive sentiment: compound ≥ 0.05
 - negative sentiment: compound ≤ -0.05
 - neutral sentiment: -0.05 < compound < 0.05


In [None]:
# Load the vaderSentiment library
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create the VADER analyzer once; its polarity_scores() method is called for each text scored later
sentiment = SentimentIntensityAnalyzer()


In [None]:

# Score every comment with VADER, label it positive / negative / neutral, report the
# totals and display the positive rows.
# NOTE(review): this cell reads 'reddit_comments.json' and its "comment" column rather
# than the tweets loaded earlier in the notebook — confirm this is the intended input.
df_Sentimental_Analysis = pd.read_json('reddit_comments.json')


def classify_compound(score):
    """Return the sentiment label for a VADER compound score.

    'positive' when score >= 0.05, 'negative' when score <= -0.05,
    'neutral' otherwise.
    """
    if score >= 0.05:
        return 'positive'
    if score <= -0.05:
        return 'negative'
    return 'neutral'


# Compute the compound score column in one pass, so it gets a numeric dtype instead of
# the object dtype that results from seeding the column with empty strings.
df_Sentimental_Analysis['compound'] = df_Sentimental_Analysis['comment'].map(
    lambda text: sentiment.polarity_scores(text)['compound']
)
df_Sentimental_Analysis['sentiment'] = df_Sentimental_Analysis['compound'].map(classify_compound)

# Totals per label; .get(..., 0) covers labels that never occur
sentiment_counts = df_Sentimental_Analysis['sentiment'].value_counts()
positive = int(sentiment_counts.get('positive', 0))
negative = int(sentiment_counts.get('negative', 0))
neutral = int(sentiment_counts.get('neutral', 0))

print("Total positive feedbacks: " + str(positive))
print("Total negative feedbacks: " + str(negative))
print("Total neutral feedbacks: " + str(neutral))

# Widen the column display so long comments are not truncated in the output below
pd.options.display.max_colwidth = 1000

df_Sentimental_Analysis.loc[df_Sentimental_Analysis['sentiment'] == 'positive']