In [65]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import re
import json
from collections import Counter
import operator
import string
import gensim
import nltk
import re
from nltk.data import find
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stop = set(stopwords.words('english'))
punctuation = set(string.punctuation)
nltk_words = set([x.lower() for x in words.words()])
json_dict={}

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def wordLemmatizer(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma    
def max_match(s):
    if not s:
        return []
    for i in range(len(s), 0, -1):
        firstword = wordLemmatizer(s[:i])
        remainer = s[i:]
        if firstword in nltk_words:
            return [firstword] + max_match(remainer)
    firstword = s[0]
    remainer = s[1:]
    return [firstword] + max_match(remainer)
 

def preprocess(s, lowercase=True):
    hash_tags = [str(x) for x in re.findall(r"#\w+", s)]
    cleaned_tags = []
    for hash_tag in hash_tags:
        hash_tag = hash_tag.replace("#","")
        words = re.split(r'([A-Z][a-z]*)', hash_tag)
        if len(words) > 1 :
            cleaned_tags += [x.lower() for x in words if x] # remove empty strings
        elif len(words) == 1:
            cleaned_tags += max_match(words[0])
    hashString=' '.join(cleaned_tags)
    #remove url hashtag and @
    s = re.sub(r'(https?:\/\/[a-zA-Z0-9.\/\?=#]*|#\w+|@\w+)', '', s, flags=re.MULTILINE)
    s=' '.join((s,hashString))
    s=s.lower()
    #s = ''.join(ch for ch in s if ch not in punctuation)
    tokens = tokenize(s)
    tokens = [wordLemmatizer(x) for x in tokens]
    tokens = [ word for word in tokens if word not in stop ]
    return tokens

    
# tweet = "RT @marcobonzanini: just an example! :D http://example.com #NLP"
# print(preprocess(tweet))
# ['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']

def polarity_classification(tweetTokens, positive_words_set, negative_words_set):
    '''
    rtype: [int]
    '''
    sentimentPredictions = []
    positiveCount = 0
    negativeCount = 0
   
    for word in tweetTokens:
        if word in positive_words_set:
            positiveCount += 1
        elif word in negative_words_set:
            negativeCount += 1

    if positiveCount > negativeCount:
        polarity_score=1
    elif positiveCount <negativeCount:
        polarity_score=-1
    else:
        polarity_score=0
    return polarity_score        

def download_sample_nltk_corpus():
    nltk.download('word2vec_sample')


#
#Tokenizing the processed tweets (tweet text)
#
def posNegWordList():
    positive_seeds = ["good","nice","excellent","positive","fortunate","correct","superior","great"]
    negative_seeds = ["bad","nasty","poor","negative","unfortunate","wrong","inferior","awful"]


    word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
    model = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False)
    positive_words2 = []
    negative_words2 = []

    for word in model.vocab:
        positive_score = 0.0
        negative_score = 0.0
        for seed in positive_seeds:
            positive_score += model.similarity(word, seed)
        for seed in negative_seeds:
            negative_score += model.similarity(word, seed)
        overall_score = (positive_score  - negative_score) / (len(negative_seeds) + len(positive_seeds))
        if overall_score < -0.03:
            negative_words2.append(word)
        if overall_score > 0.03:
            positive_words2.append(word)
    return positive_words2, negative_words2
 
# if tweet contains emoticons no tweet analysis otherwise apply sentiment analysis.
def processSentimentAnnlysis(tweet_text,positive_words,negative_words):
    # due to lower case :D convert to :d
    positive_words2=positive_words
    negative_words2=negative_words
    patternSmile = re.compile("^(\|?>?[:*;Xx8=]-?o?\^?[DdPpb3)}\]>]\)?)$")
    patternSad = re.compile("^(([:><].?-?[@><cC(\[{\|]\|?|[Dd][:8;=X]<?|v.v))$")
    #tweet = json.loads(line)
    tweet=tweet_text
    #tokens = preprocess(tweet['text'])
    tokens = preprocess(tweet)
    smiley=False
    for x in tokens:
        try:
            if patternSmile.match(x):
                polarity_final_score=1  
                smiley=True
                break
            elif patternSad.match(x):
                polarity_final_score=-1
                smiley=True
                break   
        except:
            print ('')
    if not smiley :
        tokens = [''.join(char for char in stringToken if char not in punctuation) for stringToken in tokens]
        polarity_score=polarity_classification(tokens, set(positive_words2), set(negative_words2))
        polarity_final_score=polarity_score
    return polarity_final_score


In [66]:
def read_db_config(filename, section):
    """ Read database configuration file and return a dictionary object
    :param filename: name of the configuration file
    :param section: section of database configuration
    :return: a dictionary of database parameters
    """
    # create parser and read ini configuration file
    parser = ConfigParser()
    parser.read(filename)
 
    # get section, default to mysql
    db = {}
    if parser.has_section(section):
        items = parser.items(section)
        for item in items:
            db[item[0]] = item[1]
    else:
        raise Exception('{0} not found in the {1} file'.format(section, filename))
 
    return db

In [None]:
read_db_config('config.ini','mysql')

In [176]:
from mysql.connector import MySQLConnection, Error
 
def insert_tweetMelbourne(conn,tweet_id,created_at,geo_type,geo_coordinates_latitude,geo_coordinates_longitude,place_full_name,
                          place_country,place_type,place_bounding_box_type,place_bounding_box_coordinates_NE_lat,
                          place_bounding_box_coordinates_NE_long,place_bounding_box_coordinates_SW_lat,
                          place_bounding_box_coordinates_SW_long,place_country_code,place_name,text,user_id,user_verified,
                          user_followers_count,user_listed_count,user_friends_count,user_location,user_following,
                          user_geo_enabled,user_lang,polarity_score):
    query = "INSERT INTO tweetMelbourne(tweet_id,created_at,geo_type,geo_coordinates_latitude,geo_coordinates_longitude,place_full_name,"\
            "place_country,place_type,place_bounding_box_type,place_bounding_box_coordinates_NE_lat,"\
            "place_bounding_box_coordinates_NE_long,place_bounding_box_coordinates_SW_lat,"\
            "place_bounding_box_coordinates_SW_long,place_country_code,place_name,text,user_id,user_verified,"\
            "user_followers_count,user_listed_count,user_friends_count,user_location,user_following,"\
            "user_geo_enabled,user_lang,polarity_score) " \
            "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    args = (tweet_id,created_at,geo_type,geo_coordinates_latitude,geo_coordinates_longitude,place_full_name,
            place_country,place_type,place_bounding_box_type,place_bounding_box_coordinates_NE_lat,
            place_bounding_box_coordinates_NE_long,place_bounding_box_coordinates_SW_lat,
            place_bounding_box_coordinates_SW_long,place_country_code,place_name,text,user_id,user_verified,
            user_followers_count,user_listed_count,user_friends_count,user_location,user_following,
            user_geo_enabled,user_lang,polarity_score)
 
    try:        
        cursor = conn.cursor()
        cursor.execute(query, args)
 
        if cursor.lastrowid%10000==0:
            print('last insert id', cursor.lastrowid)
 
        conn.commit()
    except Error as error:
        print(error)

In [177]:
import time
def add_data(fileName):
    filename=fileName
    positive_words2,negative_words2 = posNegWordList()
    tweets = []
    labels = []
    f = open(filename)
    print('Connecting to MySQL database...')
    conn = mysql.connector.connect(host='localhost',
                                       database='aircasting_development',
                                       user='root',
                                       password='')
    if conn.is_connected():
        print('connection established.')
    else:
        print('connection failed.')
    for line in f:
        tweet_dict = json.loads(line)
        polarity_final_score=processSentimentAnnlysis(tweet_dict["value"]["text"],positive_words2,negative_words2)
        try:
            tweet_id=tweet_dict["id"]
            created_at=time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet_dict["value"]["created_at"],'%a %b %d %H:%M:%S +0000 %Y'))
            geo_type= tweet_dict["value"]["geo"]["type"]
            geo_coordinates_latitude= tweet_dict["value"]["geo"]["coordinates"][0]
            geo_coordinates_longitude= tweet_dict["value"]["geo"]["coordinates"][1]
            place_full_name= tweet_dict["value"]["place"]["full_name"]
            place_country= tweet_dict["value"]["place"]["country"]
            place_type= tweet_dict["value"]["place"]["place_type"]
            place_bounding_box_type= tweet_dict["value"]["place"]["bounding_box"]["type"]
            # North East (lat, long)
            place_bounding_box_coordinates_NE_lat= tweet_dict["value"]["place"]["bounding_box"]["coordinates"][0][0][1]
            place_bounding_box_coordinates_NE_long= tweet_dict["value"]["place"]["bounding_box"]["coordinates"][0][0][0]
            # South West (lat,long)
            place_bounding_box_coordinates_SW_lat= tweet_dict["value"]["place"]["bounding_box"]["coordinates"][0][2][1]
            place_bounding_box_coordinates_SW_long= tweet_dict["value"]["place"]["bounding_box"]["coordinates"][0][2][0]
            place_country_code= tweet_dict["value"]["place"]["country_code"]
            place_name= tweet_dict["value"]["place"]["name"]
            text= tweet_dict["value"]["text"]
            user_id= tweet_dict["value"]["user"]["id"]
            user_verified= tweet_dict["value"]["user"]["verified"]
            user_followers_count= tweet_dict["value"]["user"]["followers_count"]
            user_listed_count= tweet_dict["value"]["user"]["listed_count"]
            user_friends_count= tweet_dict["value"]["user"]["friends_count"]
            user_location= tweet_dict["value"]["user"]["location"]
            user_following= tweet_dict["value"]["user"]["following"]
            user_geo_enabled= tweet_dict["value"]["user"]["geo_enabled"]
            user_lang= tweet_dict["value"]["user"]["lang"]
            polarity_score= polarity_final_score
        except:
            pass    
        insert_tweetMelbourne(conn,tweet_id,created_at,geo_type,geo_coordinates_latitude,geo_coordinates_longitude,place_full_name,
                              place_country,place_type,place_bounding_box_type,place_bounding_box_coordinates_NE_lat,
                              place_bounding_box_coordinates_NE_long,place_bounding_box_coordinates_SW_lat,
                              place_bounding_box_coordinates_SW_long,place_country_code,place_name,text,user_id,user_verified,
                              user_followers_count,user_listed_count,user_friends_count,user_location,user_following,
                              user_geo_enabled,user_lang,polarity_score)

In [180]:
add_data('tweet1Million.json')

Connecting to MySQL database...
connection established.
('last insert id', 10000)
('last insert id', 20000)
('last insert id', 30000)
('last insert id', 40000)
('last insert id', 50000)
('last insert id', 60000)
('last insert id', 70000)
('last insert id', 80000)
('last insert id', 90000)
('last insert id', 100000)


In [179]:
(Canopy 64bit) C:\xampp\htdocs\projGreen_ver1\twitter analytics>pip install https://cdn.mysql.com/Download
s/Connector-Python/mysql-connector-python-1.0.12.tar.gz

SyntaxError: invalid syntax (<ipython-input-179-cf71dd30d3a1>, line 1)

In [166]:
from mysql.connector import MySQLConnection, Error
conn = mysql.connector.connect(host= "localhost",
                  user="root",
                  passwd="",
                  db="aircasting_development")
x = conn.cursor()

try:
 
  # query = """INSERT INTO tweetMelbourne(created_at,geo_coordinates_latitude,geo_coordinates_longitude,user_id,polarity_score) \
    #        VALUES(%s,%s,%s,%s,%s)"""
   #args = ('arjun',99.77,108.99,999,1)
   
   x.execute("Select * from tweetMelbourne")
   rows = x.fetchall()
   for row in rows:
     print(row)
except:
   print "hi"
   conn.rollback()

conn.close()

(u'538838165868797953', 1, u'Point', datetime.datetime(2014, 11, 29, 23, 33, 24), Decimal('-37.666521920'), Decimal('144.846125430'), u'Melbourne Airport, Melbourne', u'Australia', u'neighborhood', u'Polygon', Decimal('-37.710380970'), Decimal('144.797614920'), Decimal('-37.641009960'), Decimal('144.874238040'), u'AU', u'Melbourne Airport', u'wheels down, Oz. #gdaymate ????????????', 1081, u'1', 248876, 8891, 13755, u'Silicon Valley, CA', u'0', u'1', u'en', 0)
(u'539330441057013760', 2, u'Point', datetime.datetime(2014, 12, 1, 8, 9, 32), Decimal('-37.823909770'), Decimal('144.991198050'), u'Melbourne, Victoria', u'Australia', u'city', u'Polygon', Decimal('-38.433859306'), Decimal('144.593741856'), Decimal('-37.511273723'), Decimal('145.512528832'), u'AU', u'Melbourne', u'hello Melbourne geeks / Startup Victoria! cc @ga @AngelCubeMelb @500Startups http://t.co/x2YIEunWUc', 1081, u'1', 249055, 8893, 13761, u'Silicon Valley, CA', u'0', u'1', u'en', 1)


NameError: name 'tweet' is not defined