In [1]:
import tweepy
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from datetime import datetime, timedelta, timezone

Authenticate with your Twitter API credentials

-----------------------------------

In [None]:
consumer_key = 'consumer_key'
consumer_secret = 'consumer_secret'
access_token = 'access_token'
access_token_secret = 'access_token_secret'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Create API object
api = tweepy.API(auth)

In [None]:
# Define search parameters
keywords = ['pollution']
exclude_retweets = True
tweets = []

# UK coordinates (latitude, longitude) and radius
uk_latitude = 54.3781
uk_longitude = -2.9376
radius = "1000km"

# Calculate date range
today = datetime.now()
one_year_ago = today - timedelta(days=365)

for keyword in keywords:
    query = f'{keyword} -filter:retweets since:{one_year_ago.strftime("%Y-%m-%d")} until:{today.strftime("%Y-%m-%d")}'
    retrieved_tweets = tweepy.Cursor(api.search_tweets, q=query, tweet_mode='extended', geocode=f"{uk_latitude},{uk_longitude},{radius}").items()
    tweets.extend(retrieved_tweets)


----------------------

pulling data from some twitter handles

-----------------------

In [None]:
# Define search parameters
twitter_handle = '@madlendavies'  # Replace with the desired Twitter handle
num_tweets = 1000

# Define search querysewateruk
keywords = ['pollution', 'waste', 'water']
search_query = ' OR '.join(keywords)

# Define start and end dates
start_date = datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2023, 12, 31, 23, 59, 59, tzinfo=timezone.utc)

# Retrieve tweets
tweets = []
for tweet in tweepy.Cursor(api.search_tweets, q=search_query, tweet_mode='extended', lang='en').items(num_tweets):
    if not tweet.retweeted and start_date <= tweet.user.created_at <= end_date:
        tweets.append(tweet)


------------------------

structuring the unstructured data returned from the twitter api

--------------------

In [4]:
dataset = []

for tweet in tweets:
    tweet_dict = {
        'created_at': tweet.created_at,
        'id': tweet.id,
        'text': tweet.full_text,
        'source': tweet.source,
        'truncated': tweet.truncated,
        'in_reply_to_status_id': tweet.in_reply_to_status_id,
        'in_reply_to_user_id': tweet.in_reply_to_user_id,
        'in_reply_to_screen_name': tweet.in_reply_to_screen_name,
        'user_id': tweet.user.id,
        'user_name': tweet.user.name,
        'user_screen_name': tweet.user.screen_name,
        'user_location': tweet.user.location,
        'user_description': tweet.user.description,
        'user_followers_count': tweet.user.followers_count,
        'user_friends_count': tweet.user.friends_count,
        'user_listed_count': tweet.user.listed_count,
        'user_favourites_count': tweet.user.favourites_count,
        'user_statuses_count': tweet.user.statuses_count,
        'user_created_at': tweet.user.created_at,
#         'coordinates': tweet.coordinates.coordinates if tweet.coordinates else None,
        'place': tweet.place.full_name if tweet.place else None,
        'is_quote_status': tweet.is_quote_status,
        'favorite_count': tweet.favorite_count,
        'lang': tweet.lang,
        'image_url': tweet.entities['media'][0]['media_url_https'] if 'media' in tweet.entities and tweet.entities['media'][0]['type'] == 'photo' else None
    }
    dataset.append(tweet_dict)

df = pd.DataFrame(dataset)


-----------------------------

combining the dataset from different handles together

--------------------------------

In [31]:
df = pd.read_csv('HOD/AnglianWater.csv')
df1 = pd.read_csv('HOD/nwater_care.csv')
df2 = pd.read_csv('HOD/DefraGovUK.csv')
df3 = pd.read_csv('HOD/DwrCymru.csv')
df4 = pd.read_csv('HOD/H20EU.csv')
df5 = pd.read_csv('HOD/hafrendcymru.csv')
df6 = pd.read_csv('HOD/@JNCC_UK.csv')
df7 = pd.read_csv('HOD/LDNWaterkeeper.csv')
df8 = pd.read_csv('HOD/NorthumbrianH20.csv')
df9 = pd.read_csv('HOD/@theriverstrust.csv')
df10 = pd.read_csv('HOD/sascampaigns.csv')
df11 = pd.read_csv('HOD/sewateruk.csv')
df12 = pd.read_csv('HOD/SouthernWater.csv')
df13 = pd.read_csv('HOD/SouthWestWater.csv')
df14 = pd.read_csv('HOD/thameswater.csv')
df15 = pd.read_csv('HOD/WorldBankWater.csv')
df16 = pd.read_csv('HOD/stwater.csv')
df17 = pd.read_csv('HOD/uw_wwt.csv')
df18 = pd.read_csv('HOD/UK_WIR.csv')
df19 = pd.read_csv('HOD/unitedutilities.csv')
df20 = pd.read_csv('HOD/wessexwater.csv')

In [32]:
data = pd.concat([df,df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15,df16,df17,df18,df19,df20], axis =0)

In [33]:
data.reset_index(drop=True, inplace=True)

In [34]:
data = data[~data.duplicated(subset =['text', 'id'])] # removing duplicated tweets

In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6878 entries, 0 to 7690
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   created_at               6878 non-null   object 
 1   id                       6878 non-null   int64  
 2   text                     6878 non-null   object 
 3   source                   6878 non-null   object 
 4   truncated                6878 non-null   bool   
 5   in_reply_to_status_id    1778 non-null   float64
 6   in_reply_to_user_id      1814 non-null   float64
 7   in_reply_to_screen_name  1814 non-null   object 
 8   user_id                  6878 non-null   int64  
 9   user_name                6877 non-null   object 
 10  user_screen_name         6878 non-null   object 
 11  user_location            3350 non-null   object 
 12  user_description         5249 non-null   object 
 13  user_followers_count     6878 non-null   int64  
 14  user_friends_count      

In [36]:
dfs = pd.read_csv('/home/c4leb/enomfon/HOD/tweet_UK/tweet_UK4k.csv')
test = pd.concat((dfs, data), axis = 0)
data = test[~test.duplicated()]
data.shape

(11050, 25)

In [37]:
data.reset_index(drop=True, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11050 entries, 0 to 11049
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   created_at               11050 non-null  object 
 1   id                       11050 non-null  int64  
 2   text                     11050 non-null  object 
 3   source                   11050 non-null  object 
 4   truncated                11050 non-null  bool   
 5   in_reply_to_status_id    3860 non-null   float64
 6   in_reply_to_user_id      3973 non-null   float64
 7   in_reply_to_screen_name  3973 non-null   object 
 8   user_id                  11050 non-null  int64  
 9   user_name                11049 non-null  object 
 10  user_screen_name         11050 non-null  object 
 11  user_location            7499 non-null   object 
 12  user_description         9149 non-null   object 
 13  user_followers_count     11050 non-null  int64  
 14  user_friends_count    

------------------

converting the date column to datetime and filtering the tweets that are created before 2020

-------------------

In [38]:
data['user_created_at'] = pd.to_datetime(data['user_created_at'])
# Check for dates below 2021
mask = data['user_created_at'].dt.year < 2020

# Filter the DataFrame based on the mask
data = data[~mask]

In [39]:
data.lang.unique(), data.shape #we have two languages here, english, french, and undefined

(array(['en', 'fr', 'und'], dtype=object), (7683, 25))

In [40]:
data[data.lang == 'en']

Unnamed: 0,created_at,id,text,source,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,user_id,user_name,...,user_listed_count,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url
3,2023-06-24 21:16:45+00:00,1672715409433190400,@HothfieldPlace All that pollution what “appar...,Twitter for iPhone,False,1.672699e+18,1.406968e+18,HothfieldPlace,1605229534915301386,we are considerate,...,0,312,704,2022-12-20 15:52:12+00:00,,,False,0,en,
5,2023-06-24 21:08:08+00:00,1672713238255992834,@PetenShirl Means a lower gear and mor polluti...,Twitter Web App,False,1.671498e+18,7.188028e+07,PetenShirl,1565050995289296896,John,...,1,134,3104,2022-08-31 18:58:21+00:00,,,False,0,en,
10,2023-06-24 20:39:56+00:00,1672706144815415296,@toryboypierce @mailplus Londoners want ULEZ\n...,Twitter for iPhone,False,1.672705e+18,1.944467e+09,toryboypierce,1512083894564122635,Mrs Kensington,...,0,9438,4711,2022-04-07 15:05:13+00:00,,,False,2,en,https://pbs.twimg.com/tweet_video_thumb/FzalXs...
11,2023-06-24 20:38:54+00:00,1672705883921326081,#LTN have reduced road space redundancy in the...,Twitter for Android,False,,,,1320381280345874433,The UK LTN Résistance,...,3,62642,54919,2020-10-25 15:08:34+00:00,,,True,3,en,
13,2023-06-24 20:33:47+00:00,1672704594655191047,@YBcabbie @suemitch2017 @BBC @Keir_Starmer @Co...,Twitter for Android,False,1.672692e+18,1.849338e+09,YBcabbie,1238431291885920259,Sir Digby,...,4,34524,28163,2020-03-13 11:47:00+00:00,,,False,0,en,https://pbs.twimg.com/media/Fzaj9tpWIAEcwJa.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11045,2023-06-26 08:21:34+00:00,1673245100951085056,RT @winrina_bbl: ⭐️💭 230626\n[5:03pm KST]\n\n“...,Twitter for iPhone,False,,,,1653434994445152256,bluerina🍒,...,0,13246,2363,2023-05-02 16:23:29+00:00,,,False,0,en,
11046,2023-06-26 08:21:33+00:00,1673245098321276928,RT @AnkitaBnsl: 5 Major Shiva Temples or Panch...,Twitter for Android,False,,,,1345042305921605635,த.நந்தகோபால் ஃ D.NANDHAGOBAL🇮🇳🚩🚩🚩,...,2,275823,98650,2021-01-01 16:21:03+00:00,,,False,0,en,
11047,2023-06-26 08:21:33+00:00,1673245098270928898,RT @gunwookiebb: gunwook ended up soaking wet ...,Twitter for Android,False,,,,1627187556029980672,WMJ,...,0,4318,177,2023-02-19 06:05:52+00:00,,,False,0,en,
11048,2023-06-26 08:21:33+00:00,1673245096790331392,RT @aespasbbl: [230626] 5:03PM KST: ⭐️ \n\n- +...,Twitter for Android,False,,,,1380391060149391369,elena,...,0,14727,7726,2021-04-09 05:24:21+00:00,,,False,0,en,


In [41]:
# data[['user_location']].to_csv('user_location2.csv', index = True)

------------

Geocoding the location column to get the longitude and latitude

-------------

In [13]:
data.shape

(7683, 25)

In [14]:
userLocation = pd.read_csv('user_location2.csv')
userLocation

Unnamed: 0.1,Unnamed: 0,user_location
0,3,"London, England"
1,5,East Cheshire
2,10,"Kensington, London"
3,11,UK
4,13,"Birmingham, England"
...,...,...
7678,11045,wintop only
7679,11046,Jubail Kingdom of Saudi Arabi
7680,11047,
7681,11048,kwangya


In [18]:
geolocator = Nominatim(user_agent="myGeocoder")
# Store the market names and their corresponding coordinates in a dictionary
market_dict = {}
for market_name in userLocation.user_location:
    try:
        location = geolocator.geocode(market_name, country_codes='GB')
        coordinates = f"({location.latitude},{location.longitude})"
        market_dict[market_name] = coordinates
    except:
        first = 'NaN'
        coordinates = f"({first},{first})"
        market_dict[market_name] = coordinates

df = pd.DataFrame(market_dict.items(), columns = ['name', 'cordinats'])
# df.to_csv('user_cordinates.csv', index = False)

In [26]:
df[df.cordinats != "(NaN,NaN)"]

Unnamed: 0,name,cordinats
0,"London, England","(51.5073359,-0.12765)"
1,East Cheshire,"(53.08951585,-2.432569348703039)"
2,"Kensington, London","(51.500841550000004,-0.17914971498845972)"
3,UK,"(54.7023545,-3.2765753)"
4,"Birmingham, England","(52.4796992,-1.9026911)"
...,...,...
2583,Poland 🇮🇩,"(51.2726139,-0.9359351)"
2594,Mushroom Forest,"(51.4268762,-0.7097463)"
2607,"Brisbane, Australia","(53.3548081,-2.1577743)"
2613,1st wife,"(51.2784665,1.0545089)"


In [20]:
# df.to_csv('user_cordinates3.csv', index = False)

In [6]:
test2 = pd.read_csv('user_cordinates1.csv')
test2

Unnamed: 0,name,cordinats
0,"London, England","(51.5073359,-0.12765)"
1,East Cheshire,"(53.08951585,-2.432569348703039)"
2,"Kensington, London","(51.500841550000004,-0.17914971498845972)"
3,UK,"(NaN,NaN)"
4,"Birmingham, England","(52.4796992,-1.9026911)"
...,...,...
2632,"اسلام آباد, پاکستان","(NaN,NaN)"
2633,sa puso ni jungkook,"(NaN,NaN)"
2634,yjm,"(NaN,NaN)"
2635,wintop only,"(NaN,NaN)"


In [7]:
test3 = pd.read_csv('user_cordinates3.csv')
test3

Unnamed: 0,name,cordinats
0,"London, England","(51.5073359,-0.12765)"
1,East Cheshire,"(53.08951585,-2.432569348703039)"
2,"Kensington, London","(51.500841550000004,-0.17914971498845972)"
3,UK,"(54.7023545,-3.2765753)"
4,"Birmingham, England","(52.4796992,-1.9026911)"
...,...,...
2632,"اسلام آباد, پاکستان","(NaN,NaN)"
2633,sa puso ni jungkook,"(NaN,NaN)"
2634,yjm,"(NaN,NaN)"
2635,wintop only,"(NaN,NaN)"


In [21]:
data_test = pd.merge(test1, test3, on = 'name', how = 'right' )
data_test.dropna(subset = ['cordinats_x']).head(50)

Unnamed: 0,name,cordinats_x,cordinats_y
0,"London, England","(51.5073359,-0.12765)","(51.5073359,-0.12765)"
1,East Cheshire,"(53.08951585,-2.432569348703039)","(53.08951585,-2.432569348703039)"
2,"Kensington, London","(51.500841550000004,-0.17914971498845972)","(51.500841550000004,-0.17914971498845972)"
3,UK,"(54.7023545,-3.2765753)","(54.7023545,-3.2765753)"
4,"Birmingham, England","(52.4796992,-1.9026911)","(52.4796992,-1.9026911)"
5,"Bury Saint Edmunds, England","(52.2478821,0.7110913)","(52.2478821,0.7110913)"
6,Rotherhithe london,"(51.5002908,-0.0436321)","(51.5002908,-0.0436321)"
7,United Kingdom,"(54.7023545,-3.2765753)","(54.7023545,-3.2765753)"
9,"West Midlands, England","(52.5050033,-1.964396123331272)","(52.5050033,-1.964396123331272)"
10,West Sussex,"(50.94458445,-0.5278477012862655)","(50.94458445,-0.5278477012862655)"


In [22]:
def extract_coordinates(df):
    def extract_latitude(coord_str):
        try:
            return float(coord_str.split(',')[0][1:])
        except (ValueError, IndexError):
            return np.nan

    def extract_longitude(coord_str):
        try:
            return float(coord_str.split(',')[1][:-1])
        except (ValueError, IndexError):
            return np.nan

    df['latitude'] = df['cordinats'].apply(extract_latitude)
    df['longitude'] = df['cordinats'].apply(extract_longitude)

# Assuming 'data' is the DataFrame that contains the 'cordinats' column
userCoordinates = pd.read_csv('user_cordinates3.csv')
extract_coordinates(userCoordinates)


In [23]:
userCoordinates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2637 entries, 0 to 2636
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       2636 non-null   object 
 1   cordinats  2637 non-null   object 
 2   latitude   645 non-null    float64
 3   longitude  645 non-null    float64
dtypes: float64(2), object(2)
memory usage: 82.5+ KB


In [25]:
userCoordinates.drop(['cordinats'], axis = 1, inplace = True)

In [26]:
userCoordinates.dropna(subset = ['latitude'], inplace = True)

In [28]:
userCoordinates.head(50)

Unnamed: 0,name,latitude,longitude
0,"London, England",51.507336,-0.12765
1,East Cheshire,53.089516,-2.432569
2,"Kensington, London",51.500842,-0.17915
3,UK,54.702354,-3.276575
4,"Birmingham, England",52.479699,-1.902691
5,"Bury Saint Edmunds, England",52.247882,0.711091
6,Rotherhithe london,51.500291,-0.043632
7,United Kingdom,54.702354,-3.276575
9,"West Midlands, England",52.505003,-1.964396
10,West Sussex,50.944584,-0.527848


In [48]:
userCoordinates.rename(columns = {'name': "user_location"}, inplace = True)
userCoordinates

Unnamed: 0,user_location,latitude,longitude
0,"London, England",51.507336,-0.127650
1,East Cheshire,53.089516,-2.432569
2,"Kensington, London",51.500842,-0.179150
3,UK,54.702354,-3.276575
4,"Birmingham, England",52.479699,-1.902691
...,...,...,...
2583,Poland 🇮🇩,51.272614,-0.935935
2594,Mushroom Forest,51.426876,-0.709746
2607,"Brisbane, Australia",53.354808,-2.157774
2613,1st wife,51.278467,1.054509


In [49]:
data = pd.merge(userCoordinates, data, on = 'user_location', how = 'right' )
data

Unnamed: 0,user_location,latitude,longitude,created_at,id,text,source,truncated,in_reply_to_status_id,in_reply_to_user_id,...,user_listed_count,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url
0,"London, England",51.507336,-0.127650,2023-06-24 21:16:45+00:00,1672715409433190400,@HothfieldPlace All that pollution what “appar...,Twitter for iPhone,False,1.672699e+18,1.406968e+18,...,0,312,704,2022-12-20 15:52:12+00:00,,,False,0,en,
1,East Cheshire,53.089516,-2.432569,2023-06-24 21:08:08+00:00,1672713238255992834,@PetenShirl Means a lower gear and mor polluti...,Twitter Web App,False,1.671498e+18,7.188028e+07,...,1,134,3104,2022-08-31 18:58:21+00:00,,,False,0,en,
2,"Kensington, London",51.500842,-0.179150,2023-06-24 20:39:56+00:00,1672706144815415296,@toryboypierce @mailplus Londoners want ULEZ\n...,Twitter for iPhone,False,1.672705e+18,1.944467e+09,...,0,9438,4711,2022-04-07 15:05:13+00:00,,,False,2,en,https://pbs.twimg.com/tweet_video_thumb/FzalXs...
3,UK,54.702354,-3.276575,2023-06-24 20:38:54+00:00,1672705883921326081,#LTN have reduced road space redundancy in the...,Twitter for Android,False,,,...,3,62642,54919,2020-10-25 15:08:34+00:00,,,True,3,en,
4,"Birmingham, England",52.479699,-1.902691,2023-06-24 20:33:47+00:00,1672704594655191047,@YBcabbie @suemitch2017 @BBC @Keir_Starmer @Co...,Twitter for Android,False,1.672692e+18,1.849338e+09,...,4,34524,28163,2020-03-13 11:47:00+00:00,,,False,0,en,https://pbs.twimg.com/media/Fzaj9tpWIAEcwJa.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7678,wintop only,,,2023-06-26 08:21:34+00:00,1673245100951085056,RT @winrina_bbl: ⭐️💭 230626\n[5:03pm KST]\n\n“...,Twitter for iPhone,False,,,...,0,13246,2363,2023-05-02 16:23:29+00:00,,,False,0,en,
7679,Jubail Kingdom of Saudi Arabi,,,2023-06-26 08:21:33+00:00,1673245098321276928,RT @AnkitaBnsl: 5 Major Shiva Temples or Panch...,Twitter for Android,False,,,...,2,275823,98650,2021-01-01 16:21:03+00:00,,,False,0,en,
7680,,,,2023-06-26 08:21:33+00:00,1673245098270928898,RT @gunwookiebb: gunwook ended up soaking wet ...,Twitter for Android,False,,,...,0,4318,177,2023-02-19 06:05:52+00:00,,,False,0,en,
7681,kwangya,,,2023-06-26 08:21:33+00:00,1673245096790331392,RT @aespasbbl: [230626] 5:03PM KST: ⭐️ \n\n- +...,Twitter for Android,False,,,...,0,14727,7726,2021-04-09 05:24:21+00:00,,,False,0,en,


In [50]:
data.lang.unique()

array(['en', 'fr', 'und'], dtype=object)

In [51]:
other_lang = data[data.lang != 'en']
other_lang.reset_index(drop=True, inplace=True )
other_lang

Unnamed: 0,user_location,latitude,longitude,created_at,id,text,source,truncated,in_reply_to_status_id,in_reply_to_user_id,...,user_listed_count,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url
0,"Paris, France",,,2023-06-24 19:12:02+00:00,1672684020583346176,"""Sain, naturel, bio"". Mise en vitrine de la su...",Twitter for Android,False,,,...,0,26,1,2023-06-20 08:14:35+00:00,,,False,0,fr,https://pbs.twimg.com/media/FzaRP6hXsAQzfOx.jpg
1,West Sussex,50.944584,-0.527848,2023-06-24 18:40:31+00:00,1672676090245001216,Is #LightPollution #Pollution? @ChiArunGreen @...,Twitter Web App,False,,,...,4,3082,1727,2022-02-14 11:20:49+00:00,,,True,1,und,
2,75001,51.591018,0.082889,2023-06-24 17:55:08+00:00,1672664667808845824,@medhi9401 @mairie15 @Paris @Space_Station @ES...,Twitter for Android,False,1.555227e+18,1.491747e+18,...,1,35,197,2023-01-05 23:50:24+00:00,,"Drancy, France",False,0,fr,
3,"Honfleur, France",,,2023-06-24 15:06:12+00:00,1672622155471749121,@AlLouarn @LJacouille @lemondefr https://t.co/...,Twitter for Android,False,1.672594e+18,1.028613e+18,...,1,4644,5325,2023-03-16 23:57:02+00:00,,,False,1,und,
4,"Paris, France",,,2023-06-24 13:34:44+00:00,1672599136514850817,"@GeWoessner @brounno @marinetondelier Euh, il ...",Twitter for Android,False,1.672259e+18,1.222531e+08,...,3,5113,3471,2021-11-29 23:19:01+00:00,,,False,1,fr,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Auvergne,54.421697,-1.234967,2023-06-19 02:02:15+00:00,1670612929178116097,@JackyBerland @BonGrosDodo Cela concerne la ch...,Twitter Web App,False,1.670613e+18,1.466109e+18,...,8,9420,6602,2021-12-01 18:15:45+00:00,,,False,1,fr,
141,"Boulogne-Billancourt, France",,,2023-06-18 22:05:34+00:00,1670553363966984196,Bah c’est super ça dis donc. Donc en plus de l...,Twitter for iPhone,False,,,...,0,1939,3378,2022-10-01 17:19:03+00:00,,"Paris, France",True,3,fr,
142,"Paris, France",,,2023-06-18 19:13:07+00:00,1670509969148354560,@pascalCenteam @f_philippot Je réponds pour ré...,Twitter for iPhone,False,1.670505e+18,8.114870e+08,...,1,27016,54241,2021-10-04 19:00:22+00:00,,,True,0,fr,
143,"La Roche-sur-Yon, France",,,2023-06-18 18:35:13+00:00,1670500430122561542,@Bruno_Attal_ Et la pollution grosse merde tou...,Twitter for Android,False,1.670438e+18,1.433049e+18,...,0,182,405,2023-03-26 20:27:23+00:00,,,False,1,fr,


# translation

In [52]:
import openai
api_key = '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'
openai.api_key = api_key

def translate_text(text, source_language, target_language='en'):
    prompt = f"Translate the following {source_language} text to {target_language}: '{text}'"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": prompt}],
        temperature=0,
        max_tokens=100
    )

    # Extracting the translated text from the API response
    translated_text = response['choices'][0]['message']['content']
    return translated_text.strip()


# Function to translate each row in the DataFrame
def translate_row(row):
    if row['lang'] != 'en':  # Check if the language is already English
        translated_text = translate_text(row['text'], row['lang'], 'en')
    else:
        translated_text = row['text']  # If the language is already English, keep the original text
    return translated_text

# Create a new column 'translated_text' with translated text
other_lang['translated_text'] = other_lang.apply(translate_row, axis=1)

other_lang


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_lang['translated_text'] = other_lang.apply(translate_row, axis=1)


Unnamed: 0,user_location,latitude,longitude,created_at,id,text,source,truncated,in_reply_to_status_id,in_reply_to_user_id,...,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url,translated_text
0,"Paris, France",,,2023-06-24 19:12:02+00:00,1672684020583346176,"""Sain, naturel, bio"". Mise en vitrine de la su...",Twitter for Android,False,,,...,26,1,2023-06-20 08:14:35+00:00,,,False,0,fr,https://pbs.twimg.com/media/FzaRP6hXsAQzfOx.jpg,"""Healthy, natural, organic"". Showcasing the ov..."
1,West Sussex,50.944584,-0.527848,2023-06-24 18:40:31+00:00,1672676090245001216,Is #LightPollution #Pollution? @ChiArunGreen @...,Twitter Web App,False,,,...,3082,1727,2022-02-14 11:20:49+00:00,,,True,1,und,,Is #LightPollution #Pollution? @ChiArunGreen @...
2,75001,51.591018,0.082889,2023-06-24 17:55:08+00:00,1672664667808845824,@medhi9401 @mairie15 @Paris @Space_Station @ES...,Twitter for Android,False,1.555227e+18,1.491747e+18,...,35,197,2023-01-05 23:50:24+00:00,,"Drancy, France",False,0,fr,,'@medhi9401 @mairie15 @Paris @Space_Station @E...
3,"Honfleur, France",,,2023-06-24 15:06:12+00:00,1672622155471749121,@AlLouarn @LJacouille @lemondefr https://t.co/...,Twitter for Android,False,1.672594e+18,1.028613e+18,...,4644,5325,2023-03-16 23:57:02+00:00,,,False,1,und,,'@AlLouarn @LJacouille @lemondefr https://t.co...
4,"Paris, France",,,2023-06-24 13:34:44+00:00,1672599136514850817,"@GeWoessner @brounno @marinetondelier Euh, il ...",Twitter for Android,False,1.672259e+18,1.222531e+08,...,5113,3471,2021-11-29 23:19:01+00:00,,,False,1,fr,,"'@GeWoessner @brounno @marinetondelier Um, whe..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Auvergne,54.421697,-1.234967,2023-06-19 02:02:15+00:00,1670612929178116097,@JackyBerland @BonGrosDodo Cela concerne la ch...,Twitter Web App,False,1.670613e+18,1.466109e+18,...,9420,6602,2021-12-01 18:15:45+00:00,,,False,1,fr,,"""@JackyBerland @BonGrosDodo This concerns the ..."
141,"Boulogne-Billancourt, France",,,2023-06-18 22:05:34+00:00,1670553363966984196,Bah c’est super ça dis donc. Donc en plus de l...,Twitter for iPhone,False,,,...,1939,3378,2022-10-01 17:19:03+00:00,,"Paris, France",True,3,fr,,"Well, that's great, I must say. So, in additio..."
142,"Paris, France",,,2023-06-18 19:13:07+00:00,1670509969148354560,@pascalCenteam @f_philippot Je réponds pour ré...,Twitter for iPhone,False,1.670505e+18,8.114870e+08,...,27016,54241,2021-10-04 19:00:22+00:00,,,True,0,fr,,"""@pascalCenteam @f_philippot I'm responding to..."
143,"La Roche-sur-Yon, France",,,2023-06-18 18:35:13+00:00,1670500430122561542,@Bruno_Attal_ Et la pollution grosse merde tou...,Twitter for Android,False,1.670438e+18,1.433049e+18,...,182,405,2023-03-26 20:27:23+00:00,,,False,1,fr,,"'@Bruno_Attal_ And pollution, what a big mess,..."


In [53]:
other_lang.drop(('text'), axis = 1, inplace = True)
other_lang

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_lang.drop(('text'), axis = 1, inplace = True)


Unnamed: 0,user_location,latitude,longitude,created_at,id,source,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,...,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url,translated_text
0,"Paris, France",,,2023-06-24 19:12:02+00:00,1672684020583346176,Twitter for Android,False,,,,...,26,1,2023-06-20 08:14:35+00:00,,,False,0,fr,https://pbs.twimg.com/media/FzaRP6hXsAQzfOx.jpg,"""Healthy, natural, organic"". Showcasing the ov..."
1,West Sussex,50.944584,-0.527848,2023-06-24 18:40:31+00:00,1672676090245001216,Twitter Web App,False,,,,...,3082,1727,2022-02-14 11:20:49+00:00,,,True,1,und,,Is #LightPollution #Pollution? @ChiArunGreen @...
2,75001,51.591018,0.082889,2023-06-24 17:55:08+00:00,1672664667808845824,Twitter for Android,False,1.555227e+18,1.491747e+18,medhi9401,...,35,197,2023-01-05 23:50:24+00:00,,"Drancy, France",False,0,fr,,'@medhi9401 @mairie15 @Paris @Space_Station @E...
3,"Honfleur, France",,,2023-06-24 15:06:12+00:00,1672622155471749121,Twitter for Android,False,1.672594e+18,1.028613e+18,AlLouarn,...,4644,5325,2023-03-16 23:57:02+00:00,,,False,1,und,,'@AlLouarn @LJacouille @lemondefr https://t.co...
4,"Paris, France",,,2023-06-24 13:34:44+00:00,1672599136514850817,Twitter for Android,False,1.672259e+18,1.222531e+08,GeWoessner,...,5113,3471,2021-11-29 23:19:01+00:00,,,False,1,fr,,"'@GeWoessner @brounno @marinetondelier Um, whe..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Auvergne,54.421697,-1.234967,2023-06-19 02:02:15+00:00,1670612929178116097,Twitter Web App,False,1.670613e+18,1.466109e+18,AdrienBalG,...,9420,6602,2021-12-01 18:15:45+00:00,,,False,1,fr,,"""@JackyBerland @BonGrosDodo This concerns the ..."
141,"Boulogne-Billancourt, France",,,2023-06-18 22:05:34+00:00,1670553363966984196,Twitter for iPhone,False,,,,...,1939,3378,2022-10-01 17:19:03+00:00,,"Paris, France",True,3,fr,,"Well, that's great, I must say. So, in additio..."
142,"Paris, France",,,2023-06-18 19:13:07+00:00,1670509969148354560,Twitter for iPhone,False,1.670505e+18,8.114870e+08,pascalCenteam,...,27016,54241,2021-10-04 19:00:22+00:00,,,True,0,fr,,"""@pascalCenteam @f_philippot I'm responding to..."
143,"La Roche-sur-Yon, France",,,2023-06-18 18:35:13+00:00,1670500430122561542,Twitter for Android,False,1.670438e+18,1.433049e+18,Bruno_Attal_,...,182,405,2023-03-26 20:27:23+00:00,,,False,1,fr,,"'@Bruno_Attal_ And pollution, what a big mess,..."


In [54]:
# df = data.lang != 'en'
# data = data[~df]
# data[data.lang != 'en']

In [55]:
other_lang.rename(columns = {'translated_text': 'text'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_lang.rename(columns = {'translated_text': 'text'}, inplace = True)


In [56]:
other_lang

Unnamed: 0,user_location,latitude,longitude,created_at,id,source,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,...,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url,text
0,"Paris, France",,,2023-06-24 19:12:02+00:00,1672684020583346176,Twitter for Android,False,,,,...,26,1,2023-06-20 08:14:35+00:00,,,False,0,fr,https://pbs.twimg.com/media/FzaRP6hXsAQzfOx.jpg,"""Healthy, natural, organic"". Showcasing the ov..."
1,West Sussex,50.944584,-0.527848,2023-06-24 18:40:31+00:00,1672676090245001216,Twitter Web App,False,,,,...,3082,1727,2022-02-14 11:20:49+00:00,,,True,1,und,,Is #LightPollution #Pollution? @ChiArunGreen @...
2,75001,51.591018,0.082889,2023-06-24 17:55:08+00:00,1672664667808845824,Twitter for Android,False,1.555227e+18,1.491747e+18,medhi9401,...,35,197,2023-01-05 23:50:24+00:00,,"Drancy, France",False,0,fr,,'@medhi9401 @mairie15 @Paris @Space_Station @E...
3,"Honfleur, France",,,2023-06-24 15:06:12+00:00,1672622155471749121,Twitter for Android,False,1.672594e+18,1.028613e+18,AlLouarn,...,4644,5325,2023-03-16 23:57:02+00:00,,,False,1,und,,'@AlLouarn @LJacouille @lemondefr https://t.co...
4,"Paris, France",,,2023-06-24 13:34:44+00:00,1672599136514850817,Twitter for Android,False,1.672259e+18,1.222531e+08,GeWoessner,...,5113,3471,2021-11-29 23:19:01+00:00,,,False,1,fr,,"'@GeWoessner @brounno @marinetondelier Um, whe..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Auvergne,54.421697,-1.234967,2023-06-19 02:02:15+00:00,1670612929178116097,Twitter Web App,False,1.670613e+18,1.466109e+18,AdrienBalG,...,9420,6602,2021-12-01 18:15:45+00:00,,,False,1,fr,,"""@JackyBerland @BonGrosDodo This concerns the ..."
141,"Boulogne-Billancourt, France",,,2023-06-18 22:05:34+00:00,1670553363966984196,Twitter for iPhone,False,,,,...,1939,3378,2022-10-01 17:19:03+00:00,,"Paris, France",True,3,fr,,"Well, that's great, I must say. So, in additio..."
142,"Paris, France",,,2023-06-18 19:13:07+00:00,1670509969148354560,Twitter for iPhone,False,1.670505e+18,8.114870e+08,pascalCenteam,...,27016,54241,2021-10-04 19:00:22+00:00,,,True,0,fr,,"""@pascalCenteam @f_philippot I'm responding to..."
143,"La Roche-sur-Yon, France",,,2023-06-18 18:35:13+00:00,1670500430122561542,Twitter for Android,False,1.670438e+18,1.433049e+18,Bruno_Attal_,...,182,405,2023-03-26 20:27:23+00:00,,,False,1,fr,,"'@Bruno_Attal_ And pollution, what a big mess,..."


In [61]:
df = data.lang != 'en'
data = data[~df]
data

Unnamed: 0,user_location,latitude,longitude,created_at,id,text,source,truncated,in_reply_to_status_id,in_reply_to_user_id,...,user_listed_count,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url
0,"London, England",51.507336,-0.127650,2023-06-24 21:16:45+00:00,1672715409433190400,@HothfieldPlace All that pollution what “appar...,Twitter for iPhone,False,1.672699e+18,1.406968e+18,...,0,312,704,2022-12-20 15:52:12+00:00,,,False,0,en,
1,East Cheshire,53.089516,-2.432569,2023-06-24 21:08:08+00:00,1672713238255992834,@PetenShirl Means a lower gear and mor polluti...,Twitter Web App,False,1.671498e+18,7.188028e+07,...,1,134,3104,2022-08-31 18:58:21+00:00,,,False,0,en,
2,"Kensington, London",51.500842,-0.179150,2023-06-24 20:39:56+00:00,1672706144815415296,@toryboypierce @mailplus Londoners want ULEZ\n...,Twitter for iPhone,False,1.672705e+18,1.944467e+09,...,0,9438,4711,2022-04-07 15:05:13+00:00,,,False,2,en,https://pbs.twimg.com/tweet_video_thumb/FzalXs...
3,UK,54.702354,-3.276575,2023-06-24 20:38:54+00:00,1672705883921326081,#LTN have reduced road space redundancy in the...,Twitter for Android,False,,,...,3,62642,54919,2020-10-25 15:08:34+00:00,,,True,3,en,
4,"Birmingham, England",52.479699,-1.902691,2023-06-24 20:33:47+00:00,1672704594655191047,@YBcabbie @suemitch2017 @BBC @Keir_Starmer @Co...,Twitter for Android,False,1.672692e+18,1.849338e+09,...,4,34524,28163,2020-03-13 11:47:00+00:00,,,False,0,en,https://pbs.twimg.com/media/Fzaj9tpWIAEcwJa.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7678,wintop only,,,2023-06-26 08:21:34+00:00,1673245100951085056,RT @winrina_bbl: ⭐️💭 230626\n[5:03pm KST]\n\n“...,Twitter for iPhone,False,,,...,0,13246,2363,2023-05-02 16:23:29+00:00,,,False,0,en,
7679,Jubail Kingdom of Saudi Arabi,,,2023-06-26 08:21:33+00:00,1673245098321276928,RT @AnkitaBnsl: 5 Major Shiva Temples or Panch...,Twitter for Android,False,,,...,2,275823,98650,2021-01-01 16:21:03+00:00,,,False,0,en,
7680,,,,2023-06-26 08:21:33+00:00,1673245098270928898,RT @gunwookiebb: gunwook ended up soaking wet ...,Twitter for Android,False,,,...,0,4318,177,2023-02-19 06:05:52+00:00,,,False,0,en,
7681,kwangya,,,2023-06-26 08:21:33+00:00,1673245096790331392,RT @aespasbbl: [230626] 5:03PM KST: ⭐️ \n\n- +...,Twitter for Android,False,,,...,0,14727,7726,2021-04-09 05:24:21+00:00,,,False,0,en,


In [62]:
df = pd.concat((data, other_lang), axis = 0)

In [63]:
df

Unnamed: 0,user_location,latitude,longitude,created_at,id,text,source,truncated,in_reply_to_status_id,in_reply_to_user_id,...,user_listed_count,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url
0,"London, England",51.507336,-0.127650,2023-06-24 21:16:45+00:00,1672715409433190400,@HothfieldPlace All that pollution what “appar...,Twitter for iPhone,False,1.672699e+18,1.406968e+18,...,0,312,704,2022-12-20 15:52:12+00:00,,,False,0,en,
1,East Cheshire,53.089516,-2.432569,2023-06-24 21:08:08+00:00,1672713238255992834,@PetenShirl Means a lower gear and mor polluti...,Twitter Web App,False,1.671498e+18,7.188028e+07,...,1,134,3104,2022-08-31 18:58:21+00:00,,,False,0,en,
2,"Kensington, London",51.500842,-0.179150,2023-06-24 20:39:56+00:00,1672706144815415296,@toryboypierce @mailplus Londoners want ULEZ\n...,Twitter for iPhone,False,1.672705e+18,1.944467e+09,...,0,9438,4711,2022-04-07 15:05:13+00:00,,,False,2,en,https://pbs.twimg.com/tweet_video_thumb/FzalXs...
3,UK,54.702354,-3.276575,2023-06-24 20:38:54+00:00,1672705883921326081,#LTN have reduced road space redundancy in the...,Twitter for Android,False,,,...,3,62642,54919,2020-10-25 15:08:34+00:00,,,True,3,en,
4,"Birmingham, England",52.479699,-1.902691,2023-06-24 20:33:47+00:00,1672704594655191047,@YBcabbie @suemitch2017 @BBC @Keir_Starmer @Co...,Twitter for Android,False,1.672692e+18,1.849338e+09,...,4,34524,28163,2020-03-13 11:47:00+00:00,,,False,0,en,https://pbs.twimg.com/media/Fzaj9tpWIAEcwJa.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Auvergne,54.421697,-1.234967,2023-06-19 02:02:15+00:00,1670612929178116097,"""@JackyBerland @BonGrosDodo This concerns the ...",Twitter Web App,False,1.670613e+18,1.466109e+18,...,8,9420,6602,2021-12-01 18:15:45+00:00,,,False,1,fr,
141,"Boulogne-Billancourt, France",,,2023-06-18 22:05:34+00:00,1670553363966984196,"Well, that's great, I must say. So, in additio...",Twitter for iPhone,False,,,...,0,1939,3378,2022-10-01 17:19:03+00:00,,"Paris, France",True,3,fr,
142,"Paris, France",,,2023-06-18 19:13:07+00:00,1670509969148354560,"""@pascalCenteam @f_philippot I'm responding to...",Twitter for iPhone,False,1.670505e+18,8.114870e+08,...,1,27016,54241,2021-10-04 19:00:22+00:00,,,True,0,fr,
143,"La Roche-sur-Yon, France",,,2023-06-18 18:35:13+00:00,1670500430122561542,"'@Bruno_Attal_ And pollution, what a big mess,...",Twitter for Android,False,1.670438e+18,1.433049e+18,...,0,182,405,2023-03-26 20:27:23+00:00,,,False,1,fr,


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7683 entries, 0 to 144
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   user_location            4147 non-null   object             
 1   latitude                 1438 non-null   float64            
 2   longitude                1438 non-null   float64            
 3   created_at               7683 non-null   object             
 4   id                       7683 non-null   int64              
 5   text                     7683 non-null   object             
 6   source                   7683 non-null   object             
 7   truncated                7683 non-null   bool               
 8   in_reply_to_status_id    2265 non-null   float64            
 9   in_reply_to_user_id      2319 non-null   float64            
 10  in_reply_to_screen_name  2319 non-null   object             
 11  user_id                  7683 n

In [65]:
# df.to_csv('final_twitterDatanew.csv', index = False)

In [155]:
df

Unnamed: 0,user_location,cordinats,longitude,latitude,created_at,id,text,source,truncated,in_reply_to_status_id,...,user_listed_count,user_favourites_count,user_statuses_count,user_created_at,coordinates,place,is_quote_status,favorite_count,lang,image_url
0,"London, England","(51.5073359,-0.12765)",0.127650,51.507336,2023-06-24 21:16:45+00:00,1672715409433190400,@HothfieldPlace All that pollution what “appar...,Twitter for iPhone,False,1.672699e+18,...,0,312,704,2022-12-20 15:52:12+00:00,,,False,0,en,
1,East Cheshire,"(53.08951585,-2.432569348703039)",2.432569,53.089516,2023-06-24 21:08:08+00:00,1672713238255992834,@PetenShirl Means a lower gear and mor polluti...,Twitter Web App,False,1.671498e+18,...,1,134,3104,2022-08-31 18:58:21+00:00,,,False,0,en,
2,"Kensington, London","(51.500841550000004,-0.17914971498845972)",0.179150,51.500842,2023-06-24 20:39:56+00:00,1672706144815415296,@toryboypierce @mailplus Londoners want ULEZ\n...,Twitter for iPhone,False,1.672705e+18,...,0,9438,4711,2022-04-07 15:05:13+00:00,,,False,2,en,https://pbs.twimg.com/tweet_video_thumb/FzalXs...
3,UK,"(54.7023545,-3.2765753)",3.276575,54.702354,2023-06-24 20:38:54+00:00,1672705883921326081,#LTN have reduced road space redundancy in the...,Twitter for Android,False,,...,3,62642,54919,2020-10-25 15:08:34+00:00,,,True,3,en,
4,"Birmingham, England","(52.4796992,-1.9026911)",1.902691,52.479699,2023-06-24 20:33:47+00:00,1672704594655191047,@YBcabbie @suemitch2017 @BBC @Keir_Starmer @Co...,Twitter for Android,False,1.672692e+18,...,4,34524,28163,2020-03-13 11:47:00+00:00,,,False,0,en,https://pbs.twimg.com/media/Fzaj9tpWIAEcwJa.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3985,Auvergne,,,,2023-06-19 02:02:15+00:00,1670612929178116097,@JackyBerland @BonGrosDodo Cela concerne la ch...,Twitter Web App,False,1.670613e+18,...,8,9420,6602,2021-12-01 18:15:45+00:00,,,False,1,fr,
4016,"Boulogne-Billancourt, France",,,,2023-06-18 22:05:34+00:00,1670553363966984196,Bah c’est super ça dis donc. Donc en plus de l...,Twitter for iPhone,False,,...,0,1939,3378,2022-10-01 17:19:03+00:00,,"Paris, France",True,3,fr,
4093,"Paris, France",,,,2023-06-18 19:13:07+00:00,1670509969148354560,@pascalCenteam @f_philippot Je réponds pour ré...,Twitter for iPhone,False,1.670505e+18,...,1,27016,54241,2021-10-04 19:00:22+00:00,,,True,0,fr,
4110,"La Roche-sur-Yon, France",,,,2023-06-18 18:35:13+00:00,1670500430122561542,@Bruno_Attal_ Et la pollution grosse merde tou...,Twitter for Android,False,1.670438e+18,...,0,182,405,2023-03-26 20:27:23+00:00,,,False,1,fr,
