# Scraping Data

---



In [None]:
!pip install snscrape
import snscrape.modules.twitter as sntwitter
import pandas as pd

In [None]:
# Collect tweets that match the "ai and neuroscience" search query
query = "ai and neuroscience"
limit = 50000  # stop once this many tweets have been gathered
tweets = []

# `position` equals the number of tweets stored so far, so the loop ends
# as soon as `limit` tweets have been collected.
for position, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
    if position == limit:
        break
    # one-element row -> single 'tweet' column in the frame below
    tweets.append([tweet.content])

df = pd.DataFrame(data=tweets, columns=['tweet'])
print(df)

# Persist the scraped tweets as CSV
df.to_csv('ai_neuroscience_tweets.csv')



---



---



---



In [None]:
!pip install spark 
!pip install findspark 
!pip install pyspark
!pip install textblob


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Libraries

---



In [None]:
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from pyspark.sql.types import DoubleType
from textblob import TextBlob
from pyspark.sql.functions import udf , col,lit

# Read Data 

---



In [None]:
# Reuse the running SparkContext when there is one; otherwise start a local
# context with 4 worker threads. getOrCreate() always returns a context, so
# `sc` and `sqlContext` are bound even when this cell is re-run in a kernel
# where a context already exists.
sc = ps.SparkContext.getOrCreate(ps.SparkConf().setMaster('local[4]'))
sqlContext = SQLContext(sc)
print(sc.master)

# The CSV was written by pandas: tweets containing line breaks are wrapped in
# double quotes and embedded quotes are doubled. multiLine keeps such a tweet
# in one record and escape='"' decodes the doubled quotes; without them every
# physical line of a tweet is parsed as a separate (mostly null) row.
df = (
    sqlContext.read.format('csv')
    .options(header='true', inferSchema='true', multiLine='true', escape='"')
    .load('/content/ai_neuroscience_tweets.csv')
)
print(type(df))
print("data count " ,df.count())
# show() prints the table itself and returns None, so it is not wrapped in print()
df.show()



Just created a SparkContext
local[4]
<class 'pyspark.sql.dataframe.DataFrame'>
data count  36106
+--------------------+--------------------+
|                 _c0|               tweet|
+--------------------+--------------------+
|                   0|@cryptoworld202 Y...|
|After years of bu...| #Matrix is initi...|
|Data from the bra...| stored and proce...|
|                   1|@MatrixAINetwork ...|
|#AI #metaverse #w...|                null|
|                   2|@Metathea11 Yeah ...|
|I studied neurosc...|   I love philosophy|
|Which is why it’s...|                null|
|                   3|✨Safeguarding mem...|
|#OasisNetwork and...|                null|
|Privacy and confi...|                null|
|Control your data...|                null|
|                   4|Interested in lea...|
|https://t.co/EkIC...|                null|
|                   5|@PessoaBrain Lear...|
|                   6|Stanford's @stanf...|
|Information and a...|                null|
|                   7|@

# Check Null 

---



In [None]:
# Keep only the tweet text column for the rest of the pipeline
df = df.select("tweet")

# How many rows have no tweet text at all
tweet_is_null = df["tweet"].isNull()
print("Count Null values in data " , df.where(tweet_is_null).count())

# Discard rows whose tweet is null, then confirm none are left
data_withoutnull = df.where(df["tweet"].isNotNull()).na.drop()
remaining_nulls = data_withoutnull.where(data_withoutnull["tweet"].isNull()).count()
print("Count Null values in data after removing null  " , remaining_nulls)
print("Count data after removing null  " , data_withoutnull.count())

Count Null values in data  11530
Count Null values in data after removing null   0
Count data after removing null   24576


# Duplicates

In [None]:
# Row count going in, and how many of those rows are distinct
rows_before = data_withoutnull.count()
print("count of data before removing duplications :" , rows_before)
distinct_rows = data_withoutnull.distinct().count()
print("Distinct count: "+str(distinct_rows))

# Drop exact duplicate rows and report what is left
tweets_distinct = data_withoutnull.dropDuplicates()
rows_after = tweets_distinct.count()
print("count of data after removing duplications : "+str(rows_after))

count of data before removing duplications : 24576
Distinct count: 23003
count of data after removing duplications : 23003


# Label data 

---



In [None]:
# Function to get sentiment
def apply_blob(sentence):
    """Map a tweet to a sentiment class using TextBlob's polarity score.

    Args:
        sentence: Tweet text, or None for a null cell.

    Returns:
        0.0 when the polarity is exactly zero (neutral), 1.0 when it is
        positive, 2.0 when it is negative, or None when ``sentence`` is None.
    """
    # Spark hands null cells to a UDF as None; return a null label instead of
    # letting TextBlob raise on a non-string argument.
    if sentence is None:
        return None
    polarity = TextBlob(sentence).sentiment.polarity
    if polarity == 0.0:
        return 0.0  # Neutral
    if polarity > 0.0:
        return 1.0  # Positive
    return 2.0  # Negative

# Wrap apply_blob as a Spark UDF that produces a double column
sentiment = udf(apply_blob, returnType=DoubleType())

# Computed label first, followed by every original column
label_column = sentiment(tweets_distinct["tweet"]).alias("label")
data = tweets_distinct.select(label_column, "*")
data.show(5)

+-----+--------------------+
|label|               tweet|
+-----+--------------------+
|  1.0|@cz_binance 🔹 Ma...|
|  1.0|As a layman, the ...|
|  0.0|📣MAIN2022 call f...|
|  0.0|I think @DavidAMa...|
|  1.0|@gershbrain We ke...|
+-----+--------------------+
only showing top 5 rows



In [None]:
# Save the labelled tweets as comma-delimited CSV with a header row.
# mode("overwrite") replaces a previous export at this path, so re-running the
# notebook does not fail because the output path already exists.
(
    data.write
    .mode("overwrite")
    .options(header='True', delimiter=',')
    .csv("preprocessing_twitter_Scrapping_data_")
)