## Gender Classification Of Names
### Using Machine Learning To Detect/Predict Gender of Individuals 
+ Sklearn
+ Pandas
+ Text Extraction

In [1]:
# EDA packages
import pandas as pd
import numpy as np


In [2]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# Load our data
df = pd.read_csv('names_dataset.csv')

In [4]:
df.head()

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [5]:
df.size

285075

In [6]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [7]:
# Data Types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [8]:
# Checking for Missing Values
df.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [9]:
# Number of Female Names
df[df.sex == 'F'].size

181800

In [10]:
# Number of Male Names
df[df.sex == 'M'].size

103275

In [11]:
df_names = df

In [12]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [13]:
df_names.sex.unique()

array([0, 1], dtype=int64)

In [14]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [15]:
Xfeatures =df_names['name']

In [16]:
# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [17]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Features 
X
# Labels
y = df_names.sex

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.6398163206734908

In [22]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 63.98163206734908 %


In [23]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 100.0 %


### Sample Prediction

In [24]:
# Sample1 Prediction
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [25]:
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
# Female is 0, Male is 1
clf.predict(vect)

array([0], dtype=int64)

In [27]:
# Sample2 Prediction
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()

In [28]:
clf.predict(vect1)

array([1], dtype=int64)

In [29]:
# Sample3 Prediction of Russian Names
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()

In [30]:
clf.predict(vect2)

array([0], dtype=int64)

In [31]:
# Sample3 Prediction of Random Names
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()

In [32]:
clf.predict(vect3)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [33]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [34]:
genderpredictor("Martha")

Female


In [35]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Female
None
Female
None


### Using a custom function for feature analysis

In [36]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [37]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [38]:
# Extract the features for the dataset
df_X = features(df_names['name'])

In [39]:
df_y = df_names['sex']

In [40]:
from sklearn.feature_extraction import DictVectorizer
 
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)
 

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [41]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [42]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [43]:
dfX_train

array([{'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'ele', 'last-letter': 'a', 'last2-letters': 'ia', 'last3-letters': 'nia'},
       {'first-letter': 'a', 'first2-letters': 'ad', 'first3-letters': 'adi', 'last-letter': 'l', 'last2-letters': 'il', 'last3-letters': 'dil'},
       {'first-letter': 'k', 'first2-letters': 'ka', 'first3-letters': 'kad', 'last-letter': 'e', 'last2-letters': 'ze', 'last3-letters': 'nze'},
       ...,
       {'first-letter': 'j', 'first2-letters': 'ja', 'first3-letters': 'jaz', 'last-letter': 'y', 'last2-letters': 'ly', 'last3-letters': 'zly'},
       {'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'elv', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'ina'},
       {'first-letter': 'l', 'first2-letters': 'le', 'first3-letters': 'led', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ger'}],
      dtype=object)

In [44]:

dv = DictVectorizer()
dv.fit_transform(dfX_train)


<63666x8194 sparse matrix of type '<class 'numpy.float64'>'
	with 381996 stored elements in Compressed Sparse Row format>

In [45]:
# Model building Using DecisionTree

from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)


DecisionTreeClassifier()

In [46]:
# Build Features and Transform them
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))


In [47]:
vect3 = transform_dv.toarray()

In [48]:
# Predicting Gender of Name
# Male is 1,female = 0
dclf.predict(vect3)

array([1], dtype=int64)

In [49]:
if dclf.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [50]:
# Second Prediction With Nigerian Name
name_eg1 = ["Chioma"]
transform_dv =dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Female


In [51]:
# A function to do it
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [52]:
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan"]

In [53]:
for n in random_name_list:
    print(genderpredictor1(n))

Male
None
Female
None
Female
None
Male
None
Female
None
Male
None


In [54]:
## Accuracy of Models Decision Tree Classifier Works better than Naive Bayes
# Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train)) 
 

0.9888951716771903


In [55]:
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8670238209126566


In [56]:
dataf = pd.read_csv('new.csv')

In [57]:
dataf

Unnamed: 0.1,Unnamed: 0,review_author,review_Country,review_headline,review_rating,Gender
0,0,Rob,GB,Attrocious company that made mistake after mis...,1,male
1,1,S. L,GB,Missing box,1,unknown
2,2,Imran Alvi,GB,Very bad service,1,unknown
3,3,Jonatan Karlsson,GB,Complete Admin Chaos,1,unknown
4,4,Francesco Merletti,GB,"Boxes returned smashed, ripped and smelly",2,unknown
...,...,...,...,...,...,...
265,265,J,GB,Poor service.,1,unknown
266,266,Maria,GB,Extremely bad service,1,female
267,267,Thelma H.,GB,Labels Labels LABELS!,1,unknown
268,268,Panos,GB,Not bad,3,male


In [58]:
list1 = dataf['review_author']
list2 = []
for i in list1:
    list2.append(genderpredictor1(i))


Male
Male
Female
Male
Female
Female
Male
Female
Female
Female
Female
Male
Male
Male
Female
Male
Female
Male
Male
Male
Male
Female
Male
Male
Female
Male
Male
Female
Female
Male
Female
Female
Male
Female
Female
Female
Male
Male
Male
Male
Male
Male
Male
Female
Male
Male
Male
Male
Male
Male
Male
Male
Male
Female
Female
Male
Female
Female
Female
Female
Female
Male
Male
Female
Male
Female
Female
Male
Male
Male
Male
Male
Male
Female
Male
Female
Male
Male
Female
Male
Male
Female
Male
Male
Male
Male
Male
Male
Male
Female
Male
Male
Male
Male
Male
Female
Female
Male
Female
Female
Male
Female
Male
Male
Male
Male
Male
Male
Female
Male
Male
Male
Male
Female
Male
Female
Male
Female
Male
Female
Male
Female
Female
Female
Male
Female
Male
Male
Female
Male
Female
Female
Male
Female
Female
Male
Female
Female
Male
Male
Male
Male
Female
Female
Male
Male
Male
Female
Male
Male
Male
Female
Female
Female
Male
Male
Female
Female
Female
Female
Male
Female
Male
Female
Male
Male
Male
Male
Male
Female
Female
Male
Ma

In [59]:
list2

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

### Saving Our Model

In [60]:
decisiontreModel = open("decisiontreemodel.pkl","wb")

In [61]:
decisiontreModel.close

<function BufferedWriter.close>

In [62]:
#Alternative to Model Saving
import pickle
dctreeModel = open("namesdetectormodel.pkl","wb")

In [63]:
pickle.dump(dclf,dctreeModel)

In [64]:
dctreeModel.close()

##### Save Multinomial NB Model

In [65]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [66]:
NaiveBayesModel.close()

In [67]:
#listt

In [68]:
#listt = [dataf['review_author']]

In [69]:
'''import requests, json 

def getGenders(names):
	url = ""
	cnt = 0
	#if not isinstance(names,list):
		#names = [names,]
	
	for name in names:
		if url == "":
			url = "name[0]=" + name
		else:
			cnt += 1
			url = url + "&name[" + str(cnt) + "]=" + name
		

	req = requests.get("https://api.genderize.io?" + url)
	results = json.loads(req.text)
	
	retrn = []
	for result in results:
		if result["gender"] is not None:
			retrn.append((result["gender"], result["probability"], result["count"]))
		else:
			retrn.append((u'None',u'0.0',0.0))
	return retrn

if __name__ == '__main__':
	print(getGenders(listt))'''

'import requests, json \n\ndef getGenders(names):\n\turl = ""\n\tcnt = 0\n\t#if not isinstance(names,list):\n\t\t#names = [names,]\n\t\n\tfor name in names:\n\t\tif url == "":\n\t\t\turl = "name[0]=" + name\n\t\telse:\n\t\t\tcnt += 1\n\t\t\turl = url + "&name[" + str(cnt) + "]=" + name\n\t\t\n\n\treq = requests.get("https://api.genderize.io?" + url)\n\tresults = json.loads(req.text)\n\t\n\tretrn = []\n\tfor result in results:\n\t\tif result["gender"] is not None:\n\t\t\tretrn.append((result["gender"], result["probability"], result["count"]))\n\t\telse:\n\t\t\tretrn.append((u\'None\',u\'0.0\',0.0))\n\treturn retrn\n\nif __name__ == \'__main__\':\n\tprint(getGenders(listt))'

In [70]:
dfg = pd.read_csv('aadi.csv')

In [71]:
#dfg = dfg.drop(['Unnamed: 0', 'review_author'], axis = 1)

In [72]:
#d = pd.concat([dfg, yo])

In [73]:
#yo['review_date'] = dfg['review_date']

In [74]:
#yo.to_csv('rev.csv')

In [75]:
#dfg = dfg.iloc[0:1072, :]

In [76]:
dfg

Unnamed: 0.1,Unnamed: 0,review_author,review_headline,review_text,review_rating,review_date
0,0,Chris,We have our pax wardrobes from July,"We have our pax wardrobes from July, no mesh b...",1,2021-11-04T21:27:43.000Z
1,1,Shaf,System states my order has been picked…,System states my order has been picked up from...,1,2021-11-04T10:56:05.000Z
2,2,andrew williams,Never get anything on line and expect a…,Never get anything on line and expect a delive...,1,2021-11-04T09:51:00.000Z
3,3,Rachel Bosshard,IKEA delivery,I know many of you might be IKEA fans so I tho...,1,2021-11-03T15:51:12.000Z
4,4,Leanne Batt,The quality of the customer service is…,The quality of the customer service is as low ...,1,2021-11-03T13:40:28.000Z
...,...,...,...,...,...,...
5075,5075,Yun-Jui Tseng,Requested a cancellation due to several…,Requested a cancellation due to several delays...,1,2021-10-31T07:38:30.000Z
5076,5076,X.Li,good customer service,good customer service,5,2021-10-30T21:38:40.000Z
5077,5077,shisong jiang,Worst customer services.,Worst customer services.Ordered a set of wardr...,1,2021-10-30T13:32:14.000Z
5078,5078,Theresa,Excellent customer service from Kai,"Excellent customer service from Kai, he was so...",5,2021-10-30T09:08:35.000Z


In [77]:
dfg.columns

Index(['Unnamed: 0', 'review_author', 'review_headline', 'review_text',
       'review_rating', 'review_date'],
      dtype='object')

In [78]:
''''lista = dfg['review_author']
l2  = []
for i in lista:
    l2.append(genderpredictor1(i))'''

"'lista = dfg['review_author']\nl2  = []\nfor i in lista:\n    l2.append(genderpredictor1(i))"

In [79]:
#dfg = dfg.drop(['Unnamed: 0', 'review_headline', 'review_text',
   #    'review_rating', 'review_date'], axis = 1)

In [80]:
#len(ll)

In [81]:
#type(dfg['review_author'])

In [82]:
'''Name = dfg['review_author'].tolist()
Name
yup = []
for i in Name:
    yup.append(genderpredictor1(i))'''

"Name = dfg['review_author'].tolist()\nName\nyup = []\nfor i in Name:\n    yup.append(genderpredictor1(i))"

In [83]:
#Name

In [84]:
#for i in n2:
 #   print(n2[i])

In [85]:
#li = lista[0:1072]

In [86]:
'''dic = {
    'names': li,
    'gender': ll
}'''

"dic = {\n    'names': li,\n    'gender': ll\n}"

In [87]:
#yo = pd.DataFrame(dic)

In [88]:
#yo.to_csv('names_gen.csv')


In [89]:
#p1 = pd.read_csv('Csv.csv')

In [90]:
#p1

In [91]:
#lmm = [p1['Name']]

In [92]:
#len(lmm)

In [93]:
'''lp  = []
for i in lo:
    lp.append(genderpredictor1(i))'''

'lp  = []\nfor i in lo:\n    lp.append(genderpredictor1(i))'

In [94]:
csv = pd.read_csv('Csv.csv')

In [95]:
authors = csv['Name'].tolist()

In [96]:
authors

['Rob',
 'Sabine L',
 'Imran Alvi',
 'Jonatan Karlsson',
 'Ewa',
 'ekin örnek',
 'Martina Nedkova',
 'aseel sultan',
 'Sylvie',
 'Alexandra Badut',
 'Lawsuit',
 'Linus',
 'Marilyn Ohemaa',
 'Rafeeda Abedin',
 'Maria Vittoria',
 'AP',
 'Nathalie Clark',
 'Clau Ramos',
 'Laura Biezup',
 'Scarlett,',
 'Tabitha Mullock',
 'Christopher Onderstall',
 'Ann-Michelle Mull',
 'Alex G',
 'Julia',
 'Shen May Khoo',
 'Joana',
 'jorge kronfle',
 'Zaya G',
 'Claudia Hon',
 'Jonty',
 'Claudia Hon',
 'Julie',
 '\xa0Hyunhoi',
 'Afsana Mahtani',
 'Antonella',
 '\xa0Hyunhoi,',
 'Manu Moreau',
 'Khalad Al-Muhaysh',
 'Selina Abdul Kareem',
 'Srika N.',
 'Fanni Szabo',
 'Aiwen Chua',
 'Katrina So',
 'Sang Hun Kim',
 'Rena Abidova',
 'G O-I',
 'Kittipit Viseshsin',
 'Iustina Chirila',
 'Yara Ha',
 'Raluca Baicu',
 'Burcu',
 'Yana',
 'Yana Pencheva',
 'Srika N.',
 'Jess',
 'Michal',
 'Max Hombach',
 'Luca',
 'Saksham',
 'Penghui Shi',
 'Panagiotis Gr',
 'Joanna Rie',
 'Odette Duerden',
 'anda oglakci',
 'Rhian

In [97]:
len(authors[15])

2

In [98]:
type(authors)

list

In [99]:
len('aditya')

6

In [100]:
authors = str(authors)

In [101]:
authors

"['Rob', 'Sabine L', 'Imran Alvi', 'Jonatan Karlsson', 'Ewa', 'ekin örnek', 'Martina Nedkova', 'aseel sultan', 'Sylvie', 'Alexandra Badut', 'Lawsuit', 'Linus', 'Marilyn Ohemaa', 'Rafeeda Abedin', 'Maria Vittoria', 'AP', 'Nathalie Clark', 'Clau Ramos', 'Laura Biezup', 'Scarlett,', 'Tabitha Mullock', 'Christopher Onderstall', 'Ann-Michelle Mull', 'Alex G', 'Julia', 'Shen May Khoo', 'Joana', 'jorge kronfle', 'Zaya G', 'Claudia Hon', 'Jonty', 'Claudia Hon', 'Julie', '\\xa0Hyunhoi', 'Afsana Mahtani', 'Antonella', '\\xa0Hyunhoi,', 'Manu Moreau', 'Khalad Al-Muhaysh', 'Selina Abdul Kareem', 'Srika N.', 'Fanni Szabo', 'Aiwen Chua', 'Katrina So', 'Sang Hun Kim', 'Rena Abidova', 'G O-I', 'Kittipit Viseshsin', 'Iustina Chirila', 'Yara Ha', 'Raluca Baicu', 'Burcu', 'Yana', 'Yana Pencheva', 'Srika N.', 'Jess', 'Michal', 'Max Hombach', 'Luca', 'Saksham', 'Penghui Shi', 'Panagiotis Gr', 'Joanna Rie', 'Odette Duerden', 'anda oglakci', 'Rhiannon', 'Анастасия Шилова', 'Natasha', 'Patricia Eliana Gheorghe

In [102]:
#abbre = []
for i in range(0,204):
    authors[i] = str(authors[i])

TypeError: 'str' object does not support item assignment

In [None]:
#abbres = []
for i in range(0,204):
    if authors[i] == "武文轩":
        print(authors[i])
        ab.append(authors[i])

In [None]:
for i in abbre:
    abbres.append(i)

In [None]:
abrrevations = pd.DataFrame(abbres)

In [None]:
ab = abrrevations[0].tolist()

In [None]:
ab

In [None]:
abbre1.append(abbre)

In [None]:
abbre1

In [None]:
import re
mytext = "ML how are YU"
mytext = re.findall(r"\b[A-Z]{2,}\b", "", mytext)


In [None]:
fn = []
for i in authors:
    a = i.split()
    fn.append(a)

In [None]:
m = []
for i in authors:
   
    a = i.split()
    m.append(a[0])
    

In [None]:
m

In [None]:
#abv2 = []
for name in m:
    if len(name)==2:
        abv2.append(name)
            

In [None]:
abv2.append(ab[10])

In [None]:
abv2

In [None]:
abv1

In [None]:
for i in ab2:
    ab1.append(i)

In [None]:
abbrevations = pd.DataFrame()

In [None]:
abbrevations['Abbrevation'] = ab1

In [None]:
csv

In [None]:
abbrevations = abbrevations.drop('Abbrevation', axis = 1)

In [None]:
abbrevations.to_csv('Abbre.csv')


In [None]:
ABV = pd.DataFrame(abv2)

In [None]:
ABV.to_csv('ABV.csv')

In [103]:
dc = pd.read_csv('output.csv')

In [104]:
dp = dc[dc['body'].notna()]

In [113]:
dp1 = dp[dp['rating']!=5]

In [114]:
li1 = dp1['title']
li2 = []
for i in li1:
    li2.append(genderpredictor1(i))


Male
Male
Male
Male
Male
Female
Male
Female
Female
Male
Female
Female
Female
Male
Male
Male
Male
Male
Female
Female
Female
Male
Female
Female
Male
Male
Male
Male
Female
Male
Male
Male
Male
Male
Female
Male
Male
Male
Female
Male
Female
Female
Male
Female
Male
Male
Female
Female
Male
Male
Male
Female
Male
Male
Male
Male
Female
Male
Male
Male
Male
Male
Female
Male
Male
Male
Female
Male
Male
Male
Female
Female
Male
Female
Male
Male
Male
Female
Male
Male
Male
Male
Male
Female
Male
Male
Female
Male
Male
Female
Male
Male
Male
Male
Male
Male
Female
Male
Female
Male
Male
Male
Male
Male
Male
Male
Female
Female
Male
Male
Female
Male
Male
Male
Male
Male
Male
Male
Male
Female
Male
Female
Female
Female
Female
Female
Male
Male
Male
Male
Male
Male
Male
Female
Male
Male
Female
Male
Male
Female
Male
Male
Female
Female
Male
Female


In [115]:
len(li1)

146

In [116]:
gen = ['Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Female',
'Male',
'Female',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Female',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Female',
'Male',
'Female',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Female',
'Female',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Female',
'Male',
'Female',
]

In [105]:
dp['title']

0      Francesca Carioti
1            Helen Zhang
2             Mark Aduol
3       Ola Sidorkiewicz
4        Farrel Adhitama
             ...        
276                  范泽维
277       deok young lee
278              Siyu Su
279           Lokyi Tsoi
280       mulugeta yimer
Name: title, Length: 281, dtype: object

In [118]:
dp1['gender'] = gen

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp1['gender'] = gen


In [120]:
dp1 = dp1[dp1['rating'].notna()]

In [122]:
dp1 = dp1[dp1['rating']!=4]

In [124]:
dp1.to_csv('google data.csv')

In [106]:
list9 = dp['title']
list10 = []
for i in list9:
    list10.append(genderpredictor1(i))


Female
Male
Male
Male
Female
Female
Male
Male
Female
Male
Male
Female
Female
Male
Male
Female
Female
Male
Female
Male
Female
Female
Male
Male
Female
Male
Male
Female
Male
Male
Male
Male
Male
Female
Female
Male
Female
Male
Female
Female
Female
Male
Male
Male
Male
Male
Female
Male
Male
Male
Male
Male
Male
Female
Male
Female
Male
Male
Male
Female
Male
Female
Female
Female
Female
Male
Female
Male
Male
Female
Female
Male
Male
Male
Female
Female
Male
Male
Male
Female
Male
Female
Female
Male
Female
Male
Male
Male
Male
Male
Male
Male
Male
Male
Male
Male
Male
Female
Male
Female
Male
Male
Female
Female
Male
Female
Male
Male
Female
Female
Male
Male
Male
Male
Female
Female
Male
Male
Male
Female
Female
Female
Female
Male
Male
Male
Male
Male
Male
Female
Male
Female
Female
Male
Male
Female
Male
Male
Male
Male
Male
Male
Male
Male
Female
Male
Male
Male
Male
Male
Male
Male
Male
Female
Male
Male
Female
Male
Male
Male
Male
Male
Male
Female
Male
Male
Male
Male
Male
Female
Female
Male
Male
Male
Male
Male
Ma

In [107]:
list11 = ['Female',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Female',
'Female',
'Male',
'Female',
'Male',
'Female',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Female',
'Male',
'Female',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Female',
'Female',
'Female',
'Male',
'Female',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Female',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Male',
'Male',
'Female',
'Female',
'Male',
'Female',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Female',
'Female',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Female',
'Female',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Female',
'Female',
'Female',
'Female',
'Male',
'Female',
'Male',
'Female',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Male',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Female',
'Male',
'Male',
'Female',
'Male',
'Female',
'Male',
'Female',
'Male']

In [108]:
len(list11)

281

In [109]:
dp['gender'] = list11

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp['gender'] = list11


In [112]:
dp.to_csv('lov2_o.csv')